In [76]:
### IMPORTS
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import shap
In [2]:
### Reading the CSV file as pandas Dataframe
telco_url = "https://raw.githubusercontent.com/adamcookeunc/DATA_780_PROJECT/refs/heads/main/WA_Fn-UseC_-Telco-Customer-Churn_V4.csv"
telco_churn_df = pd.read_csv(telco_url)
telco_churn_df.head(10)
Out[2]:
| Customer ID | Gender | Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | Internet Type | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Contract | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0002-ORFBO | Female | 37 | No | No | Yes | Yes | 0 | Yes | 2 | 9 | Yes | 42.39 | No | Yes | Cable | 16 | No | Yes | No | Yes | Yes | No | No | Yes | One year | Yes | Mailed check | 65.60 | 593.30 | 0.00 | 0 | 381.51 | 974.81 | No |
| 1 | 0003-MKNFE | Male | 46 | No | No | No | No | 0 | No | 0 | 9 | Yes | 10.69 | Yes | Yes | Cable | 10 | No | No | No | No | No | Yes | Yes | No | Month-to-month | No | Mailed check | 59.90 | 542.40 | 38.33 | 10 | 96.21 | 610.28 | No |
| 2 | 0004-TLHLJ | Male | 50 | No | No | No | No | 0 | No | 0 | 4 | Yes | 33.65 | No | Yes | Fiber Optic | 30 | No | No | Yes | No | No | No | No | Yes | Month-to-month | Yes | Electronic check | 73.90 | 280.85 | 0.00 | 0 | 134.60 | 415.45 | Yes |
| 3 | 0011-IGKFF | Male | 78 | No | Yes | Yes | No | 0 | Yes | 1 | 13 | Yes | 27.82 | No | Yes | Fiber Optic | 4 | No | Yes | Yes | No | Yes | Yes | No | Yes | Month-to-month | Yes | Electronic check | 98.00 | 1237.85 | 0.00 | 0 | 361.66 | 1599.51 | Yes |
| 4 | 0013-EXCHZ | Female | 75 | No | Yes | Yes | No | 0 | Yes | 3 | 3 | Yes | 7.38 | No | Yes | Fiber Optic | 11 | No | No | No | Yes | Yes | No | No | Yes | Month-to-month | Yes | Mailed check | 83.90 | 267.40 | 0.00 | 0 | 22.14 | 289.54 | Yes |
| 5 | 0013-MHZWF | Female | 23 | Yes | No | No | Yes | 3 | No | 0 | 9 | Yes | 16.77 | No | Yes | Cable | 73 | No | No | No | Yes | Yes | Yes | Yes | Yes | Month-to-month | Yes | Credit card (automatic) | 69.40 | 571.45 | 0.00 | 0 | 150.93 | 722.38 | No |
| 6 | 0013-SMEOE | Female | 67 | No | Yes | Yes | No | 0 | Yes | 1 | 71 | Yes | 9.96 | No | Yes | Fiber Optic | 14 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 109.70 | 7904.25 | 0.00 | 0 | 707.16 | 8611.41 | No |
| 7 | 0014-BMAQU | Male | 52 | No | No | Yes | No | 0 | Yes | 8 | 63 | Yes | 12.96 | Yes | Yes | Fiber Optic | 7 | Yes | No | No | Yes | No | No | No | No | Two year | Yes | Credit card (automatic) | 84.65 | 5377.80 | 0.00 | 20 | 816.48 | 6214.28 | No |
| 8 | 0015-UOCOJ | Female | 68 | No | Yes | No | No | 0 | No | 0 | 7 | Yes | 10.53 | No | Yes | DSL | 21 | Yes | No | No | No | No | No | No | Yes | Month-to-month | Yes | Electronic check | 48.20 | 340.35 | 0.00 | 0 | 73.71 | 414.06 | No |
| 9 | 0016-QLJIS | Female | 43 | No | No | Yes | Yes | 1 | Yes | 3 | 65 | Yes | 28.46 | Yes | Yes | Cable | 14 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Two year | Yes | Mailed check | 90.45 | 5957.90 | 0.00 | 0 | 1849.90 | 7807.80 | No |
In [3]:
### Dropping 11 rows with missing data
telco_churn_df = telco_churn_df.dropna()
### Dropping the ID column because it is irrelavent
telco_churn_df = telco_churn_df.drop(columns = ['Customer ID'])
### Ensuring that all numerical columns are in the float 64 formatting.
telco_churn_df['Age'] = telco_churn_df['Age'].astype(float)
telco_churn_df['Number of Dependents'] = telco_churn_df['Number of Dependents'].astype(float)
telco_churn_df['Number of Referrals'] = telco_churn_df['Number of Referrals'].astype(float)
telco_churn_df['Tenure'] = telco_churn_df['Tenure'].astype(float)
telco_churn_df['Avg Monthly Long Distance Charges'] = telco_churn_df['Avg Monthly Long Distance Charges'].astype(float)
telco_churn_df['Avg Monthly GB Download'] = telco_churn_df['Avg Monthly GB Download'].astype(float)
telco_churn_df['Monthly Charges'] = telco_churn_df['Monthly Charges'].astype(float)
telco_churn_df['Total Charges'] = pd.to_numeric(telco_churn_df['Total Charges'], errors = 'coerce')
telco_churn_df['Total Refunds'] = telco_churn_df['Total Refunds'].astype(float)
telco_churn_df['Total Extra Data Charges'] = telco_churn_df['Total Extra Data Charges'].astype(float)
telco_churn_df['Total Long Distance Charges'] = telco_churn_df['Total Long Distance Charges'].astype(float)
telco_churn_df['Total Revenue'] = pd.to_numeric(telco_churn_df['Total Revenue'], errors = 'coerce')
### Converting Yes or No values to binary float64 formatting.
telco_churn_df['Under 30'] = telco_churn_df['Under 30'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Senior Citizen'] = telco_churn_df['Senior Citizen'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Partner'] = telco_churn_df['Partner'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Dependents'] = telco_churn_df['Dependents'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Referred a Friend'] = telco_churn_df['Referred a Friend'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Phone Service'] = telco_churn_df['Phone Service'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Internet Service'] = telco_churn_df['Internet Service'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Online Security'] = telco_churn_df['Online Security'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Online Backup'] = telco_churn_df['Online Backup'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Device Protection Plan'] = telco_churn_df['Device Protection Plan'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Premium Tech Support'] = telco_churn_df['Premium Tech Support'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Streaming TV'] = telco_churn_df['Streaming TV'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Streaming Movies'] = telco_churn_df['Streaming Movies'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Streaming Music'] = telco_churn_df['Streaming Music'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Unlimited Data'] = telco_churn_df['Unlimited Data'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Paperless Billing'] = telco_churn_df['Paperless Billing'].map({'Yes': 1.0, 'No': 0.0})
telco_churn_df['Churn'] = telco_churn_df['Churn'].map({'Yes': 1.0, 'No': 0.0})
### One-Hot-Encoding categorical/nominal columns
telco_churn_df_encoded = pd.get_dummies(telco_churn_df, columns = ['Gender', 'Multiple Lines', 'Internet Type', 'Contract', 'Payment Method'], dtype = float)
### Excluding the churn column (the column that we are predicting)
telco_churn_df_encoded_excluding_churn = telco_churn_df_encoded.drop(columns = ['Churn'])
In [4]:
### Displaying samples of each dataframe
print("telco_churn_df:")
display(telco_churn_df.head())
print('\n')
print("telco_churn_df_encoded:")
display(telco_churn_df_encoded.head())
print('\n')
print("telco_churn_df_encoded_excluding_churn:")
display(telco_churn_df_encoded_excluding_churn.head())
print('\n')
telco_churn_df:
| Gender | Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | Internet Type | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Contract | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 37.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2.0 | 9.0 | 1.0 | 42.39 | No | 1.0 | Cable | 16.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | One year | 1.0 | Mailed check | 65.6 | 593.30 | 0.00 | 0.0 | 381.51 | 974.81 | 0.0 |
| 1 | Male | 46.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 1.0 | 10.69 | Yes | 1.0 | Cable | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | Month-to-month | 0.0 | Mailed check | 59.9 | 542.40 | 38.33 | 10.0 | 96.21 | 610.28 | 0.0 |
| 2 | Male | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 33.65 | No | 1.0 | Fiber Optic | 30.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | Month-to-month | 1.0 | Electronic check | 73.9 | 280.85 | 0.00 | 0.0 | 134.60 | 415.45 | 1.0 |
| 3 | Male | 78.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 13.0 | 1.0 | 27.82 | No | 1.0 | Fiber Optic | 4.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | Month-to-month | 1.0 | Electronic check | 98.0 | 1237.85 | 0.00 | 0.0 | 361.66 | 1599.51 | 1.0 |
| 4 | Female | 75.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 1.0 | 7.38 | No | 1.0 | Fiber Optic | 11.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | Month-to-month | 1.0 | Mailed check | 83.9 | 267.40 | 0.00 | 0.0 | 22.14 | 289.54 | 1.0 |
telco_churn_df_encoded:
| Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Phone Service | Avg Monthly Long Distance Charges | Internet Service | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Paperless Billing | Monthly Charges | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | Gender_Female | Gender_Male | Multiple Lines_No | Multiple Lines_No phone service | Multiple Lines_Yes | Internet Type_Cable | Internet Type_DSL | Internet Type_Fiber Optic | Internet Type_No Internet | Contract_Month-to-month | Contract_One year | Contract_Two year | Payment Method_Bank transfer (automatic) | Payment Method_Credit card (automatic) | Payment Method_Electronic check | Payment Method_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2.0 | 9.0 | 1.0 | 42.39 | 1.0 | 16.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 65.6 | 593.30 | 0.00 | 0.0 | 381.51 | 974.81 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 46.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 1.0 | 10.69 | 1.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 59.9 | 542.40 | 38.33 | 10.0 | 96.21 | 610.28 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 33.65 | 1.0 | 30.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 73.9 | 280.85 | 0.00 | 0.0 | 134.60 | 415.45 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 78.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 13.0 | 1.0 | 27.82 | 1.0 | 4.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 98.0 | 1237.85 | 0.00 | 0.0 | 361.66 | 1599.51 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 75.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 1.0 | 7.38 | 1.0 | 11.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 83.9 | 267.40 | 0.00 | 0.0 | 22.14 | 289.54 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
telco_churn_df_encoded_excluding_churn:
| Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Phone Service | Avg Monthly Long Distance Charges | Internet Service | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Paperless Billing | Monthly Charges | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Gender_Female | Gender_Male | Multiple Lines_No | Multiple Lines_No phone service | Multiple Lines_Yes | Internet Type_Cable | Internet Type_DSL | Internet Type_Fiber Optic | Internet Type_No Internet | Contract_Month-to-month | Contract_One year | Contract_Two year | Payment Method_Bank transfer (automatic) | Payment Method_Credit card (automatic) | Payment Method_Electronic check | Payment Method_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2.0 | 9.0 | 1.0 | 42.39 | 1.0 | 16.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 65.6 | 593.30 | 0.00 | 0.0 | 381.51 | 974.81 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 46.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 1.0 | 10.69 | 1.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 59.9 | 542.40 | 38.33 | 10.0 | 96.21 | 610.28 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 50.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 33.65 | 1.0 | 30.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 73.9 | 280.85 | 0.00 | 0.0 | 134.60 | 415.45 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 78.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 13.0 | 1.0 | 27.82 | 1.0 | 4.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 98.0 | 1237.85 | 0.00 | 0.0 | 361.66 | 1599.51 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 75.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 1.0 | 7.38 | 1.0 | 11.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 83.9 | 267.40 | 0.00 | 0.0 | 22.14 | 289.54 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
In [5]:
### Summary statistics
print("Summary statistics:")
display(telco_churn_df.describe(include = 'all'))
print('\n')
### Data types
print("Data types:")
display(telco_churn_df.dtypes)
print('\n')
### Number of missing values
print("Number of missing values:")
display(telco_churn_df.isna().sum())
print('\n')
### Number of unique values
print("Number of unique values:")
display(telco_churn_df.nunique())
print('\n')
Summary statistics:
| Gender | Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | Internet Type | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Contract | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032 | 7032.000000 | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032 | 7032.000000 | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 |
| unique | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | Male | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No | NaN | Fiber Optic | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Month-to-month | NaN | Electronic check | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 3549 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3385 | NaN | 3035 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3875 | NaN | 2365 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 46.521331 | 0.198663 | 0.162400 | 0.482509 | 0.298493 | 0.467577 | 0.456911 | 1.949232 | 32.421786 | 0.903299 | 22.963471 | NaN | 0.783845 | NaN | 20.531712 | 0.286547 | 0.344852 | 0.343857 | 0.290102 | 0.384386 | 0.388367 | 0.353669 | 0.674061 | NaN | 0.592719 | NaN | 64.798208 | 2283.300441 | 1.965252 | 6.871445 | 749.957096 | 3038.163730 | 0.265785 |
| std | NaN | 16.751596 | 0.399022 | 0.368844 | 0.499729 | 0.457629 | 0.962134 | 0.498175 | 3.001324 | 24.545260 | 0.295571 | 15.449368 | NaN | 0.411650 | NaN | 20.419561 | 0.452180 | 0.475354 | 0.475028 | 0.453842 | 0.486484 | 0.487414 | 0.478142 | 0.468758 | NaN | 0.491363 | NaN | 30.085974 | 2266.771362 | 7.908412 | 25.123141 | 847.025001 | 2865.830234 | 0.441782 |
| min | NaN | 19.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 18.250000 | 18.800000 | 0.000000 | 0.000000 | 0.000000 | 21.360000 | 0.000000 |
| 25% | NaN | 32.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 1.000000 | 9.210000 | NaN | 1.000000 | NaN | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 35.587500 | 401.450000 | 0.000000 | 0.000000 | 70.567500 | 607.275000 | 0.000000 |
| 50% | NaN | 46.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 29.000000 | 1.000000 | 22.890000 | NaN | 1.000000 | NaN | 17.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | NaN | 1.000000 | NaN | 70.350000 | 1397.475000 | 0.000000 | 0.000000 | 403.875000 | 2111.300000 | 0.000000 |
| 75% | NaN | 60.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 3.000000 | 55.000000 | 1.000000 | 36.412500 | NaN | 1.000000 | NaN | 27.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 1.000000 | NaN | 89.862500 | 3794.737500 | 0.000000 | 0.000000 | 1192.432500 | 4808.797500 | 1.000000 |
| max | NaN | 80.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 9.000000 | 1.000000 | 11.000000 | 72.000000 | 1.000000 | 49.990000 | NaN | 1.000000 | NaN | 85.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 1.000000 | NaN | 118.750000 | 8684.800000 | 49.790000 | 150.000000 | 3564.720000 | 11979.340000 | 1.000000 |
Data types:
Gender object Age float64 Under 30 float64 Senior Citizen float64 Partner float64 Dependents float64 Number of Dependents float64 Referred a Friend float64 Number of Referrals float64 Tenure float64 Phone Service float64 Avg Monthly Long Distance Charges float64 Multiple Lines object Internet Service float64 Internet Type object Avg Monthly GB Download float64 Online Security float64 Online Backup float64 Device Protection Plan float64 Premium Tech Support float64 Streaming TV float64 Streaming Movies float64 Streaming Music float64 Unlimited Data float64 Contract object Paperless Billing float64 Payment Method object Monthly Charges float64 Total Charges float64 Total Refunds float64 Total Extra Data Charges float64 Total Long Distance Charges float64 Total Revenue float64 Churn float64 dtype: object
Number of missing values:
Gender 0 Age 0 Under 30 0 Senior Citizen 0 Partner 0 Dependents 0 Number of Dependents 0 Referred a Friend 0 Number of Referrals 0 Tenure 0 Phone Service 0 Avg Monthly Long Distance Charges 0 Multiple Lines 0 Internet Service 0 Internet Type 0 Avg Monthly GB Download 0 Online Security 0 Online Backup 0 Device Protection Plan 0 Premium Tech Support 0 Streaming TV 0 Streaming Movies 0 Streaming Music 0 Unlimited Data 0 Contract 0 Paperless Billing 0 Payment Method 0 Monthly Charges 0 Total Charges 0 Total Refunds 0 Total Extra Data Charges 0 Total Long Distance Charges 0 Total Revenue 0 Churn 0 dtype: int64
Number of unique values:
Gender 2 Age 62 Under 30 2 Senior Citizen 2 Partner 2 Dependents 2 Number of Dependents 10 Referred a Friend 2 Number of Referrals 12 Tenure 72 Phone Service 2 Avg Monthly Long Distance Charges 3582 Multiple Lines 3 Internet Service 2 Internet Type 4 Avg Monthly GB Download 50 Online Security 2 Online Backup 2 Device Protection Plan 2 Premium Tech Support 2 Streaming TV 2 Streaming Movies 2 Streaming Music 2 Unlimited Data 2 Contract 3 Paperless Billing 2 Payment Method 4 Monthly Charges 1584 Total Charges 6530 Total Refunds 500 Total Extra Data Charges 16 Total Long Distance Charges 6059 Total Revenue 6964 Churn 2 dtype: int64
In [6]:
### Checking for duplicates
telco_churn_df.duplicated().sum()
Out[6]:
0
In [7]:
### Distribution of Churn (The target variable)
plt.figure(figsize = (8, 6))
sns.countplot(x = 'Churn', data = telco_churn_df)
plt.title('Churn Distribution')
plt.show()
print('\n')
print("Churn Distribution:")
print(telco_churn_df['Churn'].value_counts())
print('\n')
print("Churn Distribution Percentages:")
print(telco_churn_df['Churn'].value_counts(normalize = True) * 100)
print('\n')
Churn Distribution: Churn 0.0 5163 1.0 1869 Name: count, dtype: int64 Churn Distribution Percentages: Churn 0.0 73.421502 1.0 26.578498 Name: proportion, dtype: float64
In [8]:
### Distribution of Numerical Features
# Set up the figure with 4 rows and 3 columns
plt.figure(figsize = (20, 25))
### Age Distribution
plt.subplot(4, 3, 1)
sns.histplot(telco_churn_df['Age'], kde = True)
plt.axvline(telco_churn_df['Age'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Age'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Age Distribution')
### Number of Dependents Distribution
plt.subplot(4, 3, 2)
sns.histplot(telco_churn_df['Number of Dependents'], kde = True)
plt.axvline(telco_churn_df['Number of Dependents'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(telco_churn_df['Number of Dependents'].median(), color='green', linestyle='-', label='Median')
plt.legend()
plt.title('Number of Dependents Distribution')
### Number of Referrals Distribution
plt.subplot(4, 3, 3)
sns.histplot(telco_churn_df['Number of Referrals'], kde = True)
plt.axvline(telco_churn_df['Number of Referrals'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Number of Referrals'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Number of Referrals Distribution')
### Tenure Distribution Distribution
plt.subplot(4, 3, 4)
sns.histplot(telco_churn_df['Tenure'], kde = True)
plt.axvline(telco_churn_df['Tenure'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Tenure'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Tenure Distribution')
### Avg Monthly Long Distance Charges Distribution
plt.subplot(4, 3, 5)
sns.histplot(telco_churn_df['Avg Monthly Long Distance Charges'], kde = True)
plt.axvline(telco_churn_df['Avg Monthly Long Distance Charges'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Avg Monthly Long Distance Charges'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Avg Monthly Long Distance Charges Distribution')
### Avg Monthly GB Download Distribution
plt.subplot(4, 3, 6)
sns.histplot(telco_churn_df['Avg Monthly GB Download'], kde = True)
plt.axvline(telco_churn_df['Avg Monthly GB Download'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Avg Monthly GB Download'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Avg Monthly GB Download Distribution')
### Monthly Charges Distribution
plt.subplot(4, 3, 7)
sns.histplot(telco_churn_df['Monthly Charges'], kde = True)
plt.axvline(telco_churn_df['Monthly Charges'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Monthly Charges'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Monthly Charges Distribution')
### Total Charges Distribution
plt.subplot(4, 3, 8)
sns.histplot(telco_churn_df['Total Charges'], kde = True)
plt.axvline(telco_churn_df['Total Charges'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Total Charges'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Charges Distribution')
### Total Refunds Distribution
plt.subplot(4, 3, 9)
sns.histplot(telco_churn_df['Total Refunds'], kde = True)
plt.axvline(telco_churn_df['Total Refunds'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Total Refunds'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Refunds Distribution')
### Total Extra Data Charges Distribution
plt.subplot(4, 3, 10)
sns.histplot(telco_churn_df['Total Extra Data Charges'], kde = True)
plt.axvline(telco_churn_df['Total Extra Data Charges'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Total Extra Data Charges'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Extra Data Charges Distribution')
### Total Long Distance Charges Distribution
plt.subplot(4, 3, 11)
sns.histplot(telco_churn_df['Total Long Distance Charges'], kde = True)
plt.axvline(telco_churn_df['Total Long Distance Charges'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Total Long Distance Charges'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Long Distance Charges Distribution')
### Total Revenue Distribution
plt.subplot(4, 3, 12)
sns.histplot(telco_churn_df['Total Revenue'], kde = True)
plt.axvline(telco_churn_df['Total Revenue'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_df['Total Revenue'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Revenue Distribution')
plt.tight_layout()
plt.show()
In [9]:
### Correlation Heatmap
plt.figure(figsize=(20, 15))
correlation_matrix = telco_churn_df_encoded.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
In [10]:
### Correlation matrix
print("Correlation Matrix:")
display(correlation_matrix)
print('\n')
Correlation Matrix:
| Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Phone Service | Avg Monthly Long Distance Charges | Internet Service | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Paperless Billing | Monthly Charges | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | Gender_Female | Gender_Male | Multiple Lines_No | Multiple Lines_No phone service | Multiple Lines_Yes | Internet Type_Cable | Internet Type_DSL | Internet Type_Fiber Optic | Internet Type_No Internet | Contract_Month-to-month | Contract_One year | Contract_Two year | Payment Method_Bank transfer (automatic) | Payment Method_Credit card (automatic) | Payment Method_Electronic check | Payment Method_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | -0.672368 | 0.681788 | -0.003175 | -0.143419 | -0.117923 | -0.004176 | -0.024628 | 0.009308 | 0.010614 | -0.011633 | 0.116397 | -0.377404 | -0.038820 | 0.035111 | 0.028283 | -0.044310 | 0.059545 | 0.073113 | -0.155987 | 0.083587 | 0.100100 | 0.144117 | 0.059152 | 0.024014 | 0.024866 | 0.002667 | 0.047727 | 0.115458 | -0.002238 | 0.002238 | -0.095962 | -0.010614 | 0.103438 | -0.039784 | -0.056101 | 0.170614 | -0.116397 | 0.093841 | -0.042538 | -0.068806 | 0.001521 | -0.024970 | 0.123758 | -0.116342 |
| Under 30 | -0.672368 | 1.000000 | -0.219243 | 0.014933 | 0.047516 | 0.034748 | 0.016238 | 0.023268 | 0.000272 | -0.009538 | 0.009618 | -0.034663 | 0.523659 | 0.032866 | 0.005430 | 0.004226 | 0.021776 | -0.016109 | -0.009177 | 0.124437 | -0.030920 | -0.037742 | -0.043745 | -0.013608 | -0.011951 | 0.012494 | -0.001276 | -0.010998 | -0.054300 | 0.002891 | -0.002891 | 0.029621 | 0.009538 | -0.035675 | 0.020235 | 0.024908 | -0.063283 | 0.034663 | -0.037849 | 0.027656 | 0.017745 | -0.004599 | 0.023229 | -0.042879 | 0.030021 |
| Senior Citizen | 0.681788 | -0.219243 | 1.000000 | 0.016957 | -0.210550 | -0.167915 | 0.010223 | -0.026212 | 0.015683 | 0.008392 | -0.000842 | 0.182519 | -0.102318 | -0.038576 | 0.066663 | 0.059514 | -0.060577 | 0.105445 | 0.119842 | -0.148300 | 0.140025 | 0.156258 | 0.219874 | 0.102411 | 0.028400 | 0.032735 | 0.010094 | 0.084195 | 0.150541 | 0.001819 | -0.001819 | -0.136377 | -0.008392 | 0.142996 | -0.047384 | -0.074308 | 0.246086 | -0.182519 | 0.137752 | -0.046491 | -0.116205 | -0.016235 | -0.024359 | 0.171322 | -0.152987 |
| Partner | -0.003175 | 0.014933 | 0.016957 | 1.000000 | 0.452269 | 0.323767 | 0.949904 | 0.672637 | 0.381912 | 0.018397 | 0.010481 | 0.000286 | 0.057354 | 0.143346 | 0.141849 | 0.153556 | 0.120206 | 0.124483 | 0.118108 | 0.089287 | 0.019374 | -0.013957 | 0.097825 | 0.319072 | 0.040437 | 0.018184 | 0.265229 | 0.330814 | -0.149982 | 0.001379 | -0.001379 | -0.130028 | -0.018397 | 0.142561 | 0.002648 | -0.000117 | -0.001387 | -0.000286 | -0.280202 | 0.083067 | 0.247334 | 0.111406 | 0.082327 | -0.083207 | -0.096948 |
| Dependents | -0.143419 | 0.047516 | -0.210550 | 0.452269 | 1.000000 | 0.723428 | 0.432923 | 0.332458 | 0.163386 | -0.001078 | 0.005319 | -0.138383 | 0.070469 | 0.080786 | 0.023639 | 0.013900 | 0.063053 | -0.016499 | -0.038375 | 0.007572 | -0.092725 | -0.110131 | -0.112343 | 0.064653 | 0.012737 | -0.018348 | 0.110315 | 0.083547 | -0.163128 | -0.010349 | 0.010349 | 0.023388 | 0.001078 | -0.024307 | 0.028477 | 0.030875 | -0.159950 | 0.138383 | -0.229715 | 0.069222 | 0.201699 | 0.052369 | 0.061134 | -0.149274 | 0.056448 |
| Number of Dependents | -0.117923 | 0.034748 | -0.167915 | 0.323767 | 0.723428 | 1.000000 | 0.308802 | 0.277538 | 0.109527 | -0.011027 | -0.007596 | -0.152002 | 0.130379 | 0.043098 | 0.001905 | -0.019481 | 0.022196 | -0.052526 | -0.064282 | -0.032111 | -0.112996 | -0.106157 | -0.130953 | 0.023576 | 0.014333 | -0.014141 | 0.069757 | 0.039102 | -0.218468 | -0.005786 | 0.005786 | 0.024037 | 0.011027 | -0.030918 | 0.007966 | 0.008177 | -0.138502 | 0.152002 | -0.152413 | 0.015161 | 0.163149 | 0.046441 | 0.033681 | -0.124778 | 0.061649 |
| Referred a Friend | -0.004176 | 0.016238 | 0.010223 | 0.949904 | 0.432923 | 0.308802 | 1.000000 | 0.708110 | 0.363433 | 0.016130 | 0.008983 | 0.003818 | 0.055655 | 0.139739 | 0.143537 | 0.153972 | 0.121977 | 0.119700 | 0.116079 | 0.090557 | 0.019636 | -0.010113 | 0.095144 | 0.305563 | 0.038567 | 0.020478 | 0.251822 | 0.316191 | -0.148614 | 0.003756 | -0.003756 | -0.122066 | -0.016130 | 0.133149 | 0.009048 | 0.003377 | -0.005606 | -0.003818 | -0.271210 | 0.080294 | 0.239499 | 0.107248 | 0.074911 | -0.082537 | -0.086326 |
| Number of Referrals | -0.024628 | 0.023268 | -0.026212 | 0.672637 | 0.332458 | 0.277538 | 0.708110 | 1.000000 | 0.328232 | 0.009696 | 0.006229 | -0.038353 | 0.038974 | 0.142768 | 0.112362 | 0.116494 | 0.108234 | 0.079508 | 0.058008 | 0.051463 | -0.005394 | -0.044016 | 0.028697 | 0.251395 | 0.024994 | 0.000591 | 0.216944 | 0.262901 | -0.286305 | 0.008797 | -0.008797 | -0.071710 | -0.009696 | 0.078353 | 0.005743 | 0.022895 | -0.055193 | 0.038353 | -0.272396 | 0.068921 | 0.251721 | 0.092910 | 0.087839 | -0.123959 | -0.038233 |
| Tenure | 0.009308 | 0.000272 | 0.015683 | 0.381912 | 0.163386 | 0.109527 | 0.363433 | 0.328232 | 1.000000 | 0.007877 | 0.014349 | 0.037529 | 0.048458 | 0.328297 | 0.361138 | 0.361520 | 0.325288 | 0.280264 | 0.285402 | 0.237577 | 0.030579 | 0.004823 | 0.246862 | 0.825880 | 0.058708 | 0.081934 | 0.673900 | 0.852977 | -0.354049 | -0.005285 | 0.005285 | -0.323891 | -0.007877 | 0.332399 | -0.006318 | 0.011338 | 0.025606 | -0.037529 | -0.649346 | 0.202338 | 0.563801 | 0.243822 | 0.232800 | -0.210197 | -0.232181 |
| Phone Service | 0.010614 | -0.009538 | 0.008392 | 0.018397 | -0.001078 | -0.011027 | 0.016130 | 0.009696 | 0.007877 | 1.000000 | 0.486359 | -0.171817 | -0.103015 | -0.091676 | -0.052133 | -0.070076 | -0.095138 | -0.021383 | -0.033477 | -0.039757 | -0.121786 | 0.016696 | 0.248033 | 0.113008 | 0.006228 | -0.030788 | 0.289715 | 0.174727 | 0.011691 | 0.007515 | -0.007515 | 0.315218 | -1.000000 | 0.279530 | -0.220581 | -0.332405 | 0.285109 | 0.171817 | -0.001243 | -0.003142 | 0.004442 | 0.008271 | -0.006916 | 0.002747 | -0.004463 |
| Avg Monthly Long Distance Charges | -0.011633 | 0.009618 | -0.000842 | 0.010481 | 0.005319 | -0.007596 | 0.008983 | 0.006229 | 0.014349 | 0.486359 | 1.000000 | -0.068246 | -0.030119 | -0.027711 | -0.013654 | -0.031059 | -0.047340 | 0.002456 | 0.003230 | 0.007681 | -0.052841 | 0.018215 | 0.138690 | 0.069385 | -0.018731 | 0.002336 | 0.600024 | 0.232296 | 0.007952 | -0.009366 | 0.009366 | 0.153362 | -0.486359 | 0.135898 | -0.094702 | -0.163963 | 0.145169 | 0.068246 | -0.010838 | 0.030077 | -0.016037 | -0.002221 | -0.004703 | 0.015201 | -0.010311 |
| Internet Service | 0.116397 | -0.034663 | 0.182519 | 0.000286 | -0.138383 | -0.152002 | 0.003818 | -0.038353 | 0.037529 | -0.171817 | -0.068246 | 1.000000 | 0.528052 | 0.332799 | 0.380990 | 0.380151 | 0.335695 | 0.414951 | 0.418450 | 0.388453 | 0.755178 | 0.320592 | 0.763191 | 0.374878 | 0.020239 | 0.143639 | -0.013827 | 0.293632 | 0.227578 | 0.004745 | -0.004745 | -0.309984 | 0.171817 | 0.210794 | 0.191975 | 0.290532 | 0.457593 | -1.000000 | 0.217824 | -0.038061 | -0.217542 | 0.001094 | -0.001870 | 0.284608 | -0.319694 |
| Avg Monthly GB Download | -0.377404 | 0.523659 | -0.102318 | 0.057354 | 0.070469 | 0.130379 | 0.055655 | 0.038974 | 0.048458 | -0.103015 | -0.030119 | 0.528052 | 1.000000 | 0.235085 | 0.228900 | 0.225504 | 0.223415 | 0.227287 | 0.231100 | 0.324045 | 0.394649 | 0.142602 | 0.391170 | 0.222811 | 0.002200 | 0.088792 | 0.010964 | 0.180249 | 0.048438 | 0.013443 | -0.013443 | -0.151071 | 0.103015 | 0.091192 | 0.131599 | 0.189522 | 0.191074 | -0.528052 | 0.061760 | 0.009575 | -0.081092 | 0.012071 | 0.021877 | 0.109406 | -0.156553 |
| Online Security | -0.038820 | 0.032866 | -0.038576 | 0.143346 | 0.080786 | 0.043098 | 0.139739 | 0.142768 | 0.328297 | -0.091676 | -0.027711 | 0.332799 | 0.235085 | 1.000000 | 0.283285 | 0.274875 | 0.354458 | 0.175514 | 0.187426 | 0.195611 | 0.264217 | -0.004051 | 0.296447 | 0.412619 | 0.008264 | 0.059019 | 0.200974 | 0.386261 | -0.171270 | 0.016328 | -0.016328 | -0.151678 | 0.091676 | 0.098592 | 0.147712 | 0.239652 | -0.024556 | -0.332799 | -0.246844 | 0.100658 | 0.191698 | 0.094366 | 0.115473 | -0.112295 | -0.079918 |
| Online Backup | 0.035111 | 0.005430 | 0.066663 | 0.141849 | 0.023639 | 0.001905 | 0.143537 | 0.112362 | 0.361138 | -0.052133 | -0.013654 | 0.380990 | 0.228900 | 0.283285 | 1.000000 | 0.303058 | 0.293705 | 0.281601 | 0.274523 | 0.245519 | 0.283018 | 0.127056 | 0.441529 | 0.510100 | 0.016885 | 0.095951 | 0.240238 | 0.475271 | -0.082307 | 0.013093 | -0.013093 | -0.230724 | 0.052133 | 0.202228 | 0.072474 | 0.119140 | 0.167546 | -0.380990 | -0.164393 | 0.084113 | 0.111391 | 0.086942 | 0.090455 | -0.000364 | -0.174075 |
| Device Protection Plan | 0.028283 | 0.004226 | 0.059514 | 0.153556 | 0.013900 | -0.019481 | 0.153972 | 0.116494 | 0.361520 | -0.070076 | -0.031059 | 0.380151 | 0.225504 | 0.274875 | 0.303058 | 1.000000 | 0.332850 | 0.389924 | 0.402309 | 0.349307 | 0.295807 | 0.104079 | 0.482607 | 0.522881 | 0.026076 | 0.074066 | 0.210496 | 0.476373 | -0.066193 | 0.000807 | -0.000807 | -0.240847 | 0.070076 | 0.201733 | 0.071434 | 0.109779 | 0.175531 | -0.380151 | -0.225988 | 0.102911 | 0.165248 | 0.083047 | 0.111252 | -0.003308 | -0.187325 |
| Premium Tech Support | -0.044310 | 0.021776 | -0.060577 | 0.120206 | 0.063053 | 0.022196 | 0.121977 | 0.108234 | 0.325288 | -0.095138 | -0.047340 | 0.335695 | 0.223415 | 0.354458 | 0.293705 | 0.332850 | 1.000000 | 0.277549 | 0.280155 | 0.276926 | 0.250647 | 0.037536 | 0.338301 | 0.432868 | 0.034919 | 0.088094 | 0.182633 | 0.397039 | -0.164716 | 0.008507 | -0.008507 | -0.155534 | 0.095138 | 0.100421 | 0.152082 | 0.228523 | -0.015476 | -0.335695 | -0.285491 | 0.096258 | 0.240924 | 0.100472 | 0.117024 | -0.114807 | -0.084631 |
| Streaming TV | 0.059545 | -0.016109 | 0.105445 | 0.124483 | -0.016499 | -0.052526 | 0.119700 | 0.079508 | 0.280264 | -0.021383 | 0.002456 | 0.414951 | 0.227287 | 0.175514 | 0.281601 | 0.389924 | 0.277549 | 1.000000 | 0.533380 | 0.455546 | 0.322453 | 0.224241 | 0.629668 | 0.515709 | 0.021798 | 0.072341 | 0.182613 | 0.462455 | 0.063254 | 0.007124 | -0.007124 | -0.267466 | 0.021383 | 0.257804 | 0.024789 | 0.006579 | 0.323082 | -0.414951 | -0.112550 | 0.061930 | 0.072124 | 0.046121 | 0.040010 | 0.144747 | -0.247712 |
| Streaming Movies | 0.073113 | -0.009177 | 0.119842 | 0.118108 | -0.038375 | -0.064282 | 0.116079 | 0.058008 | 0.285402 | -0.033477 | 0.003230 | 0.418450 | 0.231100 | 0.187426 | 0.274523 | 0.402309 | 0.280155 | 0.533380 | 1.000000 | 0.848367 | 0.318182 | 0.211583 | 0.627235 | 0.519867 | 0.013359 | 0.094243 | 0.185992 | 0.466957 | 0.060860 | 0.010105 | -0.010105 | -0.275995 | 0.033477 | 0.259194 | 0.021755 | 0.019955 | 0.316525 | -0.418450 | -0.117867 | 0.064780 | 0.075603 | 0.048755 | 0.048398 | 0.137420 | -0.250290 |
| Streaming Music | -0.155987 | 0.124437 | -0.148300 | 0.089287 | 0.007572 | -0.032111 | 0.090557 | 0.051463 | 0.237577 | -0.039757 | 0.007681 | 0.388453 | 0.324045 | 0.195611 | 0.245519 | 0.349307 | 0.276926 | 0.455546 | 0.848367 | 1.000000 | 0.296729 | 0.167027 | 0.535984 | 0.440169 | 0.003000 | 0.079455 | 0.150947 | 0.393460 | 0.045107 | 0.007241 | -0.007241 | -0.215000 | 0.039757 | 0.193726 | 0.045018 | 0.056984 | 0.244781 | -0.388453 | -0.121075 | 0.059512 | 0.084362 | 0.044312 | 0.046286 | 0.097943 | -0.199388 |
| Unlimited Data | 0.083587 | -0.030920 | 0.140025 | 0.019374 | -0.092725 | -0.112996 | 0.019636 | -0.005394 | 0.030579 | -0.121786 | -0.052841 | 0.755178 | 0.394649 | 0.264217 | 0.283018 | 0.295807 | 0.250647 | 0.322453 | 0.318182 | 0.296729 | 1.000000 | 0.244843 | 0.581878 | 0.287919 | 0.010931 | -0.393357 | -0.009551 | 0.221432 | 0.166327 | -0.000459 | 0.000459 | -0.229947 | 0.121786 | 0.159759 | 0.146017 | 0.214257 | 0.349286 | -0.755178 | 0.157990 | -0.029993 | -0.155511 | 0.007037 | 0.008660 | 0.200256 | -0.240912 |
| Paperless Billing | 0.100100 | -0.037742 | 0.156258 | -0.013957 | -0.110131 | -0.106157 | -0.010113 | -0.044016 | 0.004823 | 0.016696 | 0.018215 | 0.320592 | 0.142602 | -0.004051 | 0.127056 | 0.104079 | 0.037536 | 0.224241 | 0.211583 | 0.167027 | 0.244843 | 1.000000 | 0.351930 | 0.157830 | 0.003718 | 0.038479 | 0.018234 | 0.130554 | 0.191454 | 0.011902 | -0.011902 | -0.151974 | -0.016696 | 0.163746 | -0.011097 | -0.053843 | 0.319703 | -0.320592 | 0.168296 | -0.052278 | -0.146281 | -0.017469 | -0.013726 | 0.208427 | -0.203981 |
| Monthly Charges | 0.144117 | -0.043745 | 0.219874 | 0.097825 | -0.112343 | -0.130953 | 0.095144 | 0.028697 | 0.246862 | 0.248033 | 0.138690 | 0.763191 | 0.391170 | 0.296447 | 0.441529 | 0.482607 | 0.338301 | 0.629668 | 0.627235 | 0.535984 | 0.581878 | 0.351930 | 1.000000 | 0.651065 | 0.033213 | 0.126036 | 0.245880 | 0.588655 | 0.192858 | 0.013779 | -0.013779 | -0.338514 | -0.248033 | 0.490912 | -0.054821 | -0.122094 | 0.774365 | -0.763191 | 0.058933 | 0.004810 | -0.073256 | 0.042410 | 0.030055 | 0.271117 | -0.376568 |
| Total Charges | 0.059152 | -0.013608 | 0.102411 | 0.319072 | 0.064653 | 0.023576 | 0.305563 | 0.251395 | 0.825880 | 0.113008 | 0.069385 | 0.374878 | 0.222811 | 0.412619 | 0.510100 | 0.522881 | 0.432868 | 0.515709 | 0.519867 | 0.440169 | 0.287919 | 0.157830 | 0.651065 | 1.000000 | 0.039262 | 0.121580 | 0.609900 | 0.972185 | -0.199484 | -0.000048 | 0.000048 | -0.396765 | -0.113008 | 0.469042 | -0.031504 | -0.037005 | 0.363704 | -0.374878 | -0.446776 | 0.170569 | 0.358036 | 0.186119 | 0.182663 | -0.060436 | -0.294708 |
| Total Refunds | 0.024014 | -0.011951 | 0.028400 | 0.040437 | 0.012737 | 0.014333 | 0.038567 | 0.024994 | 0.058708 | 0.006228 | -0.018731 | 0.020239 | 0.002200 | 0.008264 | 0.016885 | 0.026076 | 0.034919 | 0.021798 | 0.013359 | 0.003000 | 0.010931 | 0.003718 | 0.033213 | 0.039262 | 1.000000 | 0.016651 | 0.027872 | 0.036679 | -0.033954 | -0.004760 | 0.004760 | -0.041270 | -0.006228 | 0.045481 | -0.013338 | 0.004225 | 0.021890 | -0.020239 | -0.023067 | 0.005273 | 0.021853 | 0.022677 | -0.003659 | 0.012217 | -0.032526 |
| Total Extra Data Charges | 0.024866 | 0.012494 | 0.032735 | 0.018184 | -0.018348 | -0.014141 | 0.020478 | 0.000591 | 0.081934 | -0.030788 | 0.002336 | 0.143639 | 0.088792 | 0.059019 | 0.095951 | 0.074066 | 0.088094 | 0.072341 | 0.094243 | 0.079455 | -0.393357 | 0.038479 | 0.126036 | 0.121580 | 0.016651 | 1.000000 | 0.058618 | 0.122211 | 0.006885 | 0.001888 | -0.001888 | -0.081910 | 0.030788 | 0.064445 | 0.011297 | 0.057408 | 0.062919 | -0.143639 | -0.012028 | 0.025119 | -0.009925 | 0.014560 | 0.002592 | 0.040008 | -0.061947 |
| Total Long Distance Charges | 0.002667 | -0.001276 | 0.010094 | 0.265229 | 0.110315 | 0.069757 | 0.251822 | 0.216944 | 0.673900 | 0.289715 | 0.600024 | -0.013827 | 0.010964 | 0.200974 | 0.240238 | 0.210496 | 0.182633 | 0.182613 | 0.185992 | 0.150947 | -0.009551 | 0.018234 | 0.245880 | 0.609900 | 0.027872 | 0.058618 | 1.000000 | 0.778407 | -0.224507 | -0.011040 | 0.011040 | -0.148240 | -0.289715 | 0.323353 | -0.067501 | -0.085356 | 0.105457 | 0.013827 | -0.434258 | 0.155948 | 0.357385 | 0.157272 | 0.156925 | -0.128974 | -0.163840 |
| Total Revenue | 0.047727 | -0.010998 | 0.084195 | 0.330814 | 0.083547 | 0.039102 | 0.316191 | 0.262901 | 0.852977 | 0.174727 | 0.232296 | 0.293632 | 0.180249 | 0.386261 | 0.475271 | 0.476373 | 0.397039 | 0.462455 | 0.466957 | 0.393460 | 0.221432 | 0.130554 | 0.588655 | 0.972185 | 0.036679 | 0.122211 | 0.778407 | 1.000000 | -0.223986 | -0.003271 | 0.003271 | -0.358245 | -0.174727 | 0.467006 | -0.044733 | -0.054006 | 0.319337 | -0.293632 | -0.481775 | 0.181212 | 0.388675 | 0.193762 | 0.190894 | -0.085605 | -0.281982 |
| Churn | 0.115458 | -0.054300 | 0.150541 | -0.149982 | -0.163128 | -0.218468 | -0.148614 | -0.286305 | -0.354049 | 0.011691 | 0.007952 | 0.227578 | 0.048438 | -0.171270 | -0.082307 | -0.066193 | -0.164716 | 0.063254 | 0.060860 | 0.045107 | 0.166327 | 0.191454 | 0.192858 | -0.199484 | -0.033954 | 0.006885 | -0.224507 | -0.223986 | 1.000000 | 0.008545 | -0.008545 | -0.032654 | -0.011691 | 0.040033 | -0.007323 | -0.099565 | 0.279049 | -0.227578 | 0.404565 | -0.178225 | -0.301552 | -0.118136 | -0.134687 | 0.301455 | -0.090773 |
| Gender_Female | -0.002238 | 0.002891 | 0.001819 | 0.001379 | -0.010349 | -0.005786 | 0.003756 | 0.008797 | -0.005285 | 0.007515 | -0.009366 | 0.004745 | 0.013443 | 0.016328 | 0.013093 | 0.000807 | 0.008507 | 0.007124 | 0.010105 | 0.007241 | -0.000459 | 0.011902 | 0.013779 | -0.000048 | -0.004760 | 0.001888 | -0.011040 | -0.003271 | 0.008545 | 1.000000 | -1.000000 | -0.004335 | -0.007515 | 0.008883 | 0.010928 | -0.016965 | 0.011337 | -0.004745 | 0.003251 | -0.007755 | 0.003603 | 0.015973 | -0.001632 | -0.000844 | -0.013199 |
| Gender_Male | 0.002238 | -0.002891 | -0.001819 | -0.001379 | 0.010349 | 0.005786 | -0.003756 | -0.008797 | 0.005285 | -0.007515 | 0.009366 | -0.004745 | -0.013443 | -0.016328 | -0.013093 | -0.000807 | -0.008507 | -0.007124 | -0.010105 | -0.007241 | 0.000459 | -0.011902 | -0.013779 | 0.000048 | 0.004760 | -0.001888 | 0.011040 | 0.003271 | -0.008545 | -1.000000 | 1.000000 | 0.004335 | 0.007515 | -0.008883 | -0.010928 | 0.016965 | -0.011337 | 0.004745 | -0.003251 | 0.007755 | -0.003603 | -0.015973 | 0.001632 | 0.000844 | 0.013199 |
| Multiple Lines_No | -0.095962 | 0.029621 | -0.136377 | -0.130028 | 0.023388 | 0.024037 | -0.122066 | -0.071710 | -0.323891 | 0.315218 | 0.153362 | -0.309984 | -0.151071 | -0.151678 | -0.230724 | -0.240847 | -0.155534 | -0.267466 | -0.275995 | -0.215000 | -0.229947 | -0.151974 | -0.338514 | -0.396765 | -0.041270 | -0.081910 | -0.148240 | -0.358245 | -0.032654 | -0.004335 | 0.004335 | 1.000000 | -0.315218 | -0.823076 | -0.034470 | -0.055968 | -0.187305 | 0.309984 | 0.086798 | 0.001694 | -0.102756 | -0.069663 | -0.063712 | -0.080990 | 0.222395 |
| Multiple Lines_No phone service | -0.010614 | 0.009538 | -0.008392 | -0.018397 | 0.001078 | 0.011027 | -0.016130 | -0.009696 | -0.007877 | -1.000000 | -0.486359 | 0.171817 | 0.103015 | 0.091676 | 0.052133 | 0.070076 | 0.095138 | 0.021383 | 0.033477 | 0.039757 | 0.121786 | -0.016696 | -0.248033 | -0.113008 | -0.006228 | 0.030788 | -0.289715 | -0.174727 | -0.011691 | -0.007515 | 0.007515 | -0.315218 | 1.000000 | -0.279530 | 0.220581 | 0.332405 | -0.285109 | -0.171817 | 0.001243 | 0.003142 | -0.004442 | -0.008271 | 0.006916 | -0.002747 | 0.004463 |
| Multiple Lines_Yes | 0.103438 | -0.035675 | 0.142996 | 0.142561 | -0.024307 | -0.030918 | 0.133149 | 0.078353 | 0.332399 | 0.279530 | 0.135898 | 0.210794 | 0.091192 | 0.098592 | 0.202228 | 0.201733 | 0.100421 | 0.257804 | 0.259194 | 0.193726 | 0.159759 | 0.163746 | 0.490912 | 0.469042 | 0.045481 | 0.064445 | 0.323353 | 0.467006 | 0.040033 | 0.008883 | -0.008883 | -0.823076 | -0.279530 | 1.000000 | -0.097130 | -0.142301 | 0.360121 | -0.210794 | -0.088558 | -0.003594 | 0.106618 | 0.075429 | 0.060319 | 0.083583 | -0.227672 |
| Internet Type_Cable | -0.039784 | 0.020235 | -0.047384 | 0.002648 | 0.028477 | 0.007966 | 0.009048 | 0.005743 | -0.006318 | -0.220581 | -0.094702 | 0.191975 | 0.131599 | 0.147712 | 0.072474 | 0.071434 | 0.152082 | 0.024789 | 0.021755 | 0.045018 | 0.146017 | -0.011097 | -0.054821 | -0.031504 | -0.013338 | 0.011297 | -0.067501 | -0.044733 | -0.007323 | 0.010928 | -0.010928 | -0.034470 | 0.220581 | -0.097130 | 1.000000 | -0.202256 | -0.318558 | -0.191975 | -0.013142 | 0.012429 | 0.003467 | 0.000228 | 0.016805 | -0.037158 | 0.025123 |
| Internet Type_DSL | -0.056101 | 0.024908 | -0.074308 | -0.000117 | 0.030875 | 0.008177 | 0.003377 | 0.022895 | 0.011338 | -0.332405 | -0.163963 | 0.290532 | 0.189522 | 0.239652 | 0.119140 | 0.109779 | 0.228523 | 0.006579 | 0.019955 | 0.056984 | 0.214257 | -0.053843 | -0.122094 | -0.037005 | 0.004225 | 0.057408 | -0.085356 | -0.054006 | -0.099565 | -0.016965 | 0.016965 | -0.055968 | 0.332405 | -0.142301 | -0.202256 | 1.000000 | -0.482101 | -0.290532 | -0.048687 | 0.037981 | 0.020534 | 0.024845 | 0.034689 | -0.071239 | 0.021675 |
| Internet Type_Fiber Optic | 0.170614 | -0.063283 | 0.246086 | -0.001387 | -0.159950 | -0.138502 | -0.005606 | -0.055193 | 0.025606 | 0.285109 | 0.145169 | 0.457593 | 0.191074 | -0.024556 | 0.167546 | 0.175531 | -0.015476 | 0.323082 | 0.316525 | 0.244781 | 0.349286 | 0.319703 | 0.774365 | 0.363704 | 0.021890 | 0.062919 | 0.105457 | 0.319337 | 0.279049 | 0.011337 | -0.011337 | -0.187305 | -0.285109 | 0.360121 | -0.318558 | -0.482101 | 1.000000 | -0.457593 | 0.231219 | -0.072206 | -0.200608 | -0.020488 | -0.042163 | 0.321644 | -0.300577 |
| Internet Type_No Internet | -0.116397 | 0.034663 | -0.182519 | -0.000286 | 0.138383 | 0.152002 | -0.003818 | 0.038353 | -0.037529 | 0.171817 | 0.068246 | -1.000000 | -0.528052 | -0.332799 | -0.380990 | -0.380151 | -0.335695 | -0.414951 | -0.418450 | -0.388453 | -0.755178 | -0.320592 | -0.763191 | -0.374878 | -0.020239 | -0.143639 | 0.013827 | -0.293632 | -0.227578 | -0.004745 | 0.004745 | 0.309984 | -0.171817 | -0.210794 | -0.191975 | -0.290532 | -0.457593 | 1.000000 | -0.217824 | 0.038061 | 0.217542 | -0.001094 | 0.001870 | -0.284608 | 0.319694 |
| Contract_Month-to-month | 0.093841 | -0.037849 | 0.137752 | -0.280202 | -0.229715 | -0.152413 | -0.271210 | -0.272396 | -0.649346 | -0.001243 | -0.010838 | 0.217824 | 0.061760 | -0.246844 | -0.164393 | -0.225988 | -0.285491 | -0.112550 | -0.117867 | -0.121075 | 0.157990 | 0.168296 | 0.058933 | -0.446776 | -0.023067 | -0.012028 | -0.434258 | -0.481775 | 0.404565 | 0.003251 | -0.003251 | 0.086798 | 0.001243 | -0.088558 | -0.013142 | -0.048687 | 0.231219 | -0.217824 | 1.000000 | -0.570053 | -0.621933 | -0.180159 | -0.204960 | 0.330879 | 0.006209 |
| Contract_One year | -0.042538 | 0.027656 | -0.046491 | 0.083067 | 0.069222 | 0.015161 | 0.080294 | 0.068921 | 0.202338 | -0.003142 | 0.030077 | -0.038061 | 0.009575 | 0.100658 | 0.084113 | 0.102911 | 0.096258 | 0.061930 | 0.064780 | 0.059512 | -0.029993 | -0.052278 | 0.004810 | 0.170569 | 0.005273 | 0.025119 | 0.155948 | 0.181212 | -0.178225 | -0.007755 | 0.007755 | 0.001694 | 0.003142 | -0.003594 | 0.012429 | 0.037981 | -0.072206 | 0.038061 | -0.570053 | 1.000000 | -0.288843 | 0.057629 | 0.067590 | -0.109546 | 0.000197 |
| Contract_Two year | -0.068806 | 0.017745 | -0.116205 | 0.247334 | 0.201699 | 0.163149 | 0.239499 | 0.251721 | 0.563801 | 0.004442 | -0.016037 | -0.217542 | -0.081092 | 0.191698 | 0.111391 | 0.165248 | 0.240924 | 0.072124 | 0.075603 | 0.084362 | -0.155511 | -0.146281 | -0.073256 | 0.358036 | 0.021853 | -0.009925 | 0.357385 | 0.388675 | -0.301552 | 0.003603 | -0.003603 | -0.102756 | -0.004442 | 0.106618 | 0.003467 | 0.020534 | -0.200608 | 0.217542 | -0.621933 | -0.288843 | 1.000000 | 0.155004 | 0.174410 | -0.281147 | -0.007423 |
| Payment Method_Bank transfer (automatic) | 0.001521 | -0.004599 | -0.016235 | 0.111406 | 0.052369 | 0.046441 | 0.107248 | 0.092910 | 0.243822 | 0.008271 | -0.002221 | 0.001094 | 0.012071 | 0.094366 | 0.086942 | 0.083047 | 0.100472 | 0.046121 | 0.048755 | 0.044312 | 0.007037 | -0.017469 | 0.042410 | 0.186119 | 0.022677 | 0.014560 | 0.157272 | 0.193762 | -0.118136 | 0.015973 | -0.015973 | -0.069663 | -0.008271 | 0.075429 | 0.000228 | 0.024845 | -0.020488 | -0.001094 | -0.180159 | 0.057629 | 0.155004 | 1.000000 | -0.278423 | -0.377270 | -0.288097 |
| Payment Method_Credit card (automatic) | -0.024970 | 0.023229 | -0.024359 | 0.082327 | 0.061134 | 0.033681 | 0.074911 | 0.087839 | 0.232800 | -0.006916 | -0.004703 | -0.001870 | 0.021877 | 0.115473 | 0.090455 | 0.111252 | 0.117024 | 0.040010 | 0.048398 | 0.046286 | 0.008660 | -0.013726 | 0.030055 | 0.182663 | -0.003659 | 0.002592 | 0.156925 | 0.190894 | -0.134687 | -0.001632 | 0.001632 | -0.063712 | 0.006916 | 0.060319 | 0.016805 | 0.034689 | -0.042163 | 0.001870 | -0.204960 | 0.067590 | 0.174410 | -0.278423 | 1.000000 | -0.373978 | -0.285583 |
| Payment Method_Electronic check | 0.123758 | -0.042879 | 0.171322 | -0.083207 | -0.149274 | -0.124778 | -0.082537 | -0.123959 | -0.210197 | 0.002747 | 0.015201 | 0.284608 | 0.109406 | -0.112295 | -0.000364 | -0.003308 | -0.114807 | 0.144747 | 0.137420 | 0.097943 | 0.200256 | 0.208427 | 0.271117 | -0.060436 | 0.012217 | 0.040008 | -0.128974 | -0.085605 | 0.301455 | -0.000844 | 0.000844 | -0.080990 | -0.002747 | 0.083583 | -0.037158 | -0.071239 | 0.321644 | -0.284608 | 0.330879 | -0.109546 | -0.281147 | -0.377270 | -0.373978 | 1.000000 | -0.386971 |
| Payment Method_Mailed check | -0.116342 | 0.030021 | -0.152987 | -0.096948 | 0.056448 | 0.061649 | -0.086326 | -0.038233 | -0.232181 | -0.004463 | -0.010311 | -0.319694 | -0.156553 | -0.079918 | -0.174075 | -0.187325 | -0.084631 | -0.247712 | -0.250290 | -0.199388 | -0.240912 | -0.203981 | -0.376568 | -0.294708 | -0.032526 | -0.061947 | -0.163840 | -0.281982 | -0.090773 | -0.013199 | 0.013199 | 0.222395 | 0.004463 | -0.227672 | 0.025123 | 0.021675 | -0.300577 | 0.319694 | 0.006209 | 0.000197 | -0.007423 | -0.288097 | -0.285583 | -0.386971 | 1.000000 |
In [11]:
### Correlation matrix of high correlated columns
plt.figure(figsize=(20, 15))
absolute_correlation_matrix = correlation_matrix.abs()
np.fill_diagonal(absolute_correlation_matrix.values, 0)
high_correlation_columns = ((absolute_correlation_matrix > 0.5) & (absolute_correlation_matrix < 1.0)).any(axis=0)
high_correlation_columns = absolute_correlation_matrix.columns[high_correlation_columns]
high_correlation_matrix = telco_churn_df_encoded[high_correlation_columns].corr()
sns.heatmap(high_correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('High Correlation Heatmap')
plt.show()
In [12]:
### High correlation matrix
print("High correlation matrix:")
display(high_correlation_matrix)
print('\n')
High correlation matrix:
| Age | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Tenure | Avg Monthly Long Distance Charges | Internet Service | Avg Monthly GB Download | Online Backup | Device Protection Plan | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Monthly Charges | Total Charges | Total Long Distance Charges | Total Revenue | Multiple Lines_No | Multiple Lines_Yes | Internet Type_Fiber Optic | Internet Type_No Internet | Contract_Month-to-month | Contract_One year | Contract_Two year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | -0.672368 | 0.681788 | -0.003175 | -0.143419 | -0.117923 | -0.004176 | -0.024628 | 0.009308 | -0.011633 | 0.116397 | -0.377404 | 0.035111 | 0.028283 | 0.059545 | 0.073113 | -0.155987 | 0.083587 | 0.144117 | 0.059152 | 0.002667 | 0.047727 | -0.095962 | 0.103438 | 0.170614 | -0.116397 | 0.093841 | -0.042538 | -0.068806 |
| Under 30 | -0.672368 | 1.000000 | -0.219243 | 0.014933 | 0.047516 | 0.034748 | 0.016238 | 0.023268 | 0.000272 | 0.009618 | -0.034663 | 0.523659 | 0.005430 | 0.004226 | -0.016109 | -0.009177 | 0.124437 | -0.030920 | -0.043745 | -0.013608 | -0.001276 | -0.010998 | 0.029621 | -0.035675 | -0.063283 | 0.034663 | -0.037849 | 0.027656 | 0.017745 |
| Senior Citizen | 0.681788 | -0.219243 | 1.000000 | 0.016957 | -0.210550 | -0.167915 | 0.010223 | -0.026212 | 0.015683 | -0.000842 | 0.182519 | -0.102318 | 0.066663 | 0.059514 | 0.105445 | 0.119842 | -0.148300 | 0.140025 | 0.219874 | 0.102411 | 0.010094 | 0.084195 | -0.136377 | 0.142996 | 0.246086 | -0.182519 | 0.137752 | -0.046491 | -0.116205 |
| Partner | -0.003175 | 0.014933 | 0.016957 | 1.000000 | 0.452269 | 0.323767 | 0.949904 | 0.672637 | 0.381912 | 0.010481 | 0.000286 | 0.057354 | 0.141849 | 0.153556 | 0.124483 | 0.118108 | 0.089287 | 0.019374 | 0.097825 | 0.319072 | 0.265229 | 0.330814 | -0.130028 | 0.142561 | -0.001387 | -0.000286 | -0.280202 | 0.083067 | 0.247334 |
| Dependents | -0.143419 | 0.047516 | -0.210550 | 0.452269 | 1.000000 | 0.723428 | 0.432923 | 0.332458 | 0.163386 | 0.005319 | -0.138383 | 0.070469 | 0.023639 | 0.013900 | -0.016499 | -0.038375 | 0.007572 | -0.092725 | -0.112343 | 0.064653 | 0.110315 | 0.083547 | 0.023388 | -0.024307 | -0.159950 | 0.138383 | -0.229715 | 0.069222 | 0.201699 |
| Number of Dependents | -0.117923 | 0.034748 | -0.167915 | 0.323767 | 0.723428 | 1.000000 | 0.308802 | 0.277538 | 0.109527 | -0.007596 | -0.152002 | 0.130379 | 0.001905 | -0.019481 | -0.052526 | -0.064282 | -0.032111 | -0.112996 | -0.130953 | 0.023576 | 0.069757 | 0.039102 | 0.024037 | -0.030918 | -0.138502 | 0.152002 | -0.152413 | 0.015161 | 0.163149 |
| Referred a Friend | -0.004176 | 0.016238 | 0.010223 | 0.949904 | 0.432923 | 0.308802 | 1.000000 | 0.708110 | 0.363433 | 0.008983 | 0.003818 | 0.055655 | 0.143537 | 0.153972 | 0.119700 | 0.116079 | 0.090557 | 0.019636 | 0.095144 | 0.305563 | 0.251822 | 0.316191 | -0.122066 | 0.133149 | -0.005606 | -0.003818 | -0.271210 | 0.080294 | 0.239499 |
| Number of Referrals | -0.024628 | 0.023268 | -0.026212 | 0.672637 | 0.332458 | 0.277538 | 0.708110 | 1.000000 | 0.328232 | 0.006229 | -0.038353 | 0.038974 | 0.112362 | 0.116494 | 0.079508 | 0.058008 | 0.051463 | -0.005394 | 0.028697 | 0.251395 | 0.216944 | 0.262901 | -0.071710 | 0.078353 | -0.055193 | 0.038353 | -0.272396 | 0.068921 | 0.251721 |
| Tenure | 0.009308 | 0.000272 | 0.015683 | 0.381912 | 0.163386 | 0.109527 | 0.363433 | 0.328232 | 1.000000 | 0.014349 | 0.037529 | 0.048458 | 0.361138 | 0.361520 | 0.280264 | 0.285402 | 0.237577 | 0.030579 | 0.246862 | 0.825880 | 0.673900 | 0.852977 | -0.323891 | 0.332399 | 0.025606 | -0.037529 | -0.649346 | 0.202338 | 0.563801 |
| Avg Monthly Long Distance Charges | -0.011633 | 0.009618 | -0.000842 | 0.010481 | 0.005319 | -0.007596 | 0.008983 | 0.006229 | 0.014349 | 1.000000 | -0.068246 | -0.030119 | -0.013654 | -0.031059 | 0.002456 | 0.003230 | 0.007681 | -0.052841 | 0.138690 | 0.069385 | 0.600024 | 0.232296 | 0.153362 | 0.135898 | 0.145169 | 0.068246 | -0.010838 | 0.030077 | -0.016037 |
| Internet Service | 0.116397 | -0.034663 | 0.182519 | 0.000286 | -0.138383 | -0.152002 | 0.003818 | -0.038353 | 0.037529 | -0.068246 | 1.000000 | 0.528052 | 0.380990 | 0.380151 | 0.414951 | 0.418450 | 0.388453 | 0.755178 | 0.763191 | 0.374878 | -0.013827 | 0.293632 | -0.309984 | 0.210794 | 0.457593 | -1.000000 | 0.217824 | -0.038061 | -0.217542 |
| Avg Monthly GB Download | -0.377404 | 0.523659 | -0.102318 | 0.057354 | 0.070469 | 0.130379 | 0.055655 | 0.038974 | 0.048458 | -0.030119 | 0.528052 | 1.000000 | 0.228900 | 0.225504 | 0.227287 | 0.231100 | 0.324045 | 0.394649 | 0.391170 | 0.222811 | 0.010964 | 0.180249 | -0.151071 | 0.091192 | 0.191074 | -0.528052 | 0.061760 | 0.009575 | -0.081092 |
| Online Backup | 0.035111 | 0.005430 | 0.066663 | 0.141849 | 0.023639 | 0.001905 | 0.143537 | 0.112362 | 0.361138 | -0.013654 | 0.380990 | 0.228900 | 1.000000 | 0.303058 | 0.281601 | 0.274523 | 0.245519 | 0.283018 | 0.441529 | 0.510100 | 0.240238 | 0.475271 | -0.230724 | 0.202228 | 0.167546 | -0.380990 | -0.164393 | 0.084113 | 0.111391 |
| Device Protection Plan | 0.028283 | 0.004226 | 0.059514 | 0.153556 | 0.013900 | -0.019481 | 0.153972 | 0.116494 | 0.361520 | -0.031059 | 0.380151 | 0.225504 | 0.303058 | 1.000000 | 0.389924 | 0.402309 | 0.349307 | 0.295807 | 0.482607 | 0.522881 | 0.210496 | 0.476373 | -0.240847 | 0.201733 | 0.175531 | -0.380151 | -0.225988 | 0.102911 | 0.165248 |
| Streaming TV | 0.059545 | -0.016109 | 0.105445 | 0.124483 | -0.016499 | -0.052526 | 0.119700 | 0.079508 | 0.280264 | 0.002456 | 0.414951 | 0.227287 | 0.281601 | 0.389924 | 1.000000 | 0.533380 | 0.455546 | 0.322453 | 0.629668 | 0.515709 | 0.182613 | 0.462455 | -0.267466 | 0.257804 | 0.323082 | -0.414951 | -0.112550 | 0.061930 | 0.072124 |
| Streaming Movies | 0.073113 | -0.009177 | 0.119842 | 0.118108 | -0.038375 | -0.064282 | 0.116079 | 0.058008 | 0.285402 | 0.003230 | 0.418450 | 0.231100 | 0.274523 | 0.402309 | 0.533380 | 1.000000 | 0.848367 | 0.318182 | 0.627235 | 0.519867 | 0.185992 | 0.466957 | -0.275995 | 0.259194 | 0.316525 | -0.418450 | -0.117867 | 0.064780 | 0.075603 |
| Streaming Music | -0.155987 | 0.124437 | -0.148300 | 0.089287 | 0.007572 | -0.032111 | 0.090557 | 0.051463 | 0.237577 | 0.007681 | 0.388453 | 0.324045 | 0.245519 | 0.349307 | 0.455546 | 0.848367 | 1.000000 | 0.296729 | 0.535984 | 0.440169 | 0.150947 | 0.393460 | -0.215000 | 0.193726 | 0.244781 | -0.388453 | -0.121075 | 0.059512 | 0.084362 |
| Unlimited Data | 0.083587 | -0.030920 | 0.140025 | 0.019374 | -0.092725 | -0.112996 | 0.019636 | -0.005394 | 0.030579 | -0.052841 | 0.755178 | 0.394649 | 0.283018 | 0.295807 | 0.322453 | 0.318182 | 0.296729 | 1.000000 | 0.581878 | 0.287919 | -0.009551 | 0.221432 | -0.229947 | 0.159759 | 0.349286 | -0.755178 | 0.157990 | -0.029993 | -0.155511 |
| Monthly Charges | 0.144117 | -0.043745 | 0.219874 | 0.097825 | -0.112343 | -0.130953 | 0.095144 | 0.028697 | 0.246862 | 0.138690 | 0.763191 | 0.391170 | 0.441529 | 0.482607 | 0.629668 | 0.627235 | 0.535984 | 0.581878 | 1.000000 | 0.651065 | 0.245880 | 0.588655 | -0.338514 | 0.490912 | 0.774365 | -0.763191 | 0.058933 | 0.004810 | -0.073256 |
| Total Charges | 0.059152 | -0.013608 | 0.102411 | 0.319072 | 0.064653 | 0.023576 | 0.305563 | 0.251395 | 0.825880 | 0.069385 | 0.374878 | 0.222811 | 0.510100 | 0.522881 | 0.515709 | 0.519867 | 0.440169 | 0.287919 | 0.651065 | 1.000000 | 0.609900 | 0.972185 | -0.396765 | 0.469042 | 0.363704 | -0.374878 | -0.446776 | 0.170569 | 0.358036 |
| Total Long Distance Charges | 0.002667 | -0.001276 | 0.010094 | 0.265229 | 0.110315 | 0.069757 | 0.251822 | 0.216944 | 0.673900 | 0.600024 | -0.013827 | 0.010964 | 0.240238 | 0.210496 | 0.182613 | 0.185992 | 0.150947 | -0.009551 | 0.245880 | 0.609900 | 1.000000 | 0.778407 | -0.148240 | 0.323353 | 0.105457 | 0.013827 | -0.434258 | 0.155948 | 0.357385 |
| Total Revenue | 0.047727 | -0.010998 | 0.084195 | 0.330814 | 0.083547 | 0.039102 | 0.316191 | 0.262901 | 0.852977 | 0.232296 | 0.293632 | 0.180249 | 0.475271 | 0.476373 | 0.462455 | 0.466957 | 0.393460 | 0.221432 | 0.588655 | 0.972185 | 0.778407 | 1.000000 | -0.358245 | 0.467006 | 0.319337 | -0.293632 | -0.481775 | 0.181212 | 0.388675 |
| Multiple Lines_No | -0.095962 | 0.029621 | -0.136377 | -0.130028 | 0.023388 | 0.024037 | -0.122066 | -0.071710 | -0.323891 | 0.153362 | -0.309984 | -0.151071 | -0.230724 | -0.240847 | -0.267466 | -0.275995 | -0.215000 | -0.229947 | -0.338514 | -0.396765 | -0.148240 | -0.358245 | 1.000000 | -0.823076 | -0.187305 | 0.309984 | 0.086798 | 0.001694 | -0.102756 |
| Multiple Lines_Yes | 0.103438 | -0.035675 | 0.142996 | 0.142561 | -0.024307 | -0.030918 | 0.133149 | 0.078353 | 0.332399 | 0.135898 | 0.210794 | 0.091192 | 0.202228 | 0.201733 | 0.257804 | 0.259194 | 0.193726 | 0.159759 | 0.490912 | 0.469042 | 0.323353 | 0.467006 | -0.823076 | 1.000000 | 0.360121 | -0.210794 | -0.088558 | -0.003594 | 0.106618 |
| Internet Type_Fiber Optic | 0.170614 | -0.063283 | 0.246086 | -0.001387 | -0.159950 | -0.138502 | -0.005606 | -0.055193 | 0.025606 | 0.145169 | 0.457593 | 0.191074 | 0.167546 | 0.175531 | 0.323082 | 0.316525 | 0.244781 | 0.349286 | 0.774365 | 0.363704 | 0.105457 | 0.319337 | -0.187305 | 0.360121 | 1.000000 | -0.457593 | 0.231219 | -0.072206 | -0.200608 |
| Internet Type_No Internet | -0.116397 | 0.034663 | -0.182519 | -0.000286 | 0.138383 | 0.152002 | -0.003818 | 0.038353 | -0.037529 | 0.068246 | -1.000000 | -0.528052 | -0.380990 | -0.380151 | -0.414951 | -0.418450 | -0.388453 | -0.755178 | -0.763191 | -0.374878 | 0.013827 | -0.293632 | 0.309984 | -0.210794 | -0.457593 | 1.000000 | -0.217824 | 0.038061 | 0.217542 |
| Contract_Month-to-month | 0.093841 | -0.037849 | 0.137752 | -0.280202 | -0.229715 | -0.152413 | -0.271210 | -0.272396 | -0.649346 | -0.010838 | 0.217824 | 0.061760 | -0.164393 | -0.225988 | -0.112550 | -0.117867 | -0.121075 | 0.157990 | 0.058933 | -0.446776 | -0.434258 | -0.481775 | 0.086798 | -0.088558 | 0.231219 | -0.217824 | 1.000000 | -0.570053 | -0.621933 |
| Contract_One year | -0.042538 | 0.027656 | -0.046491 | 0.083067 | 0.069222 | 0.015161 | 0.080294 | 0.068921 | 0.202338 | 0.030077 | -0.038061 | 0.009575 | 0.084113 | 0.102911 | 0.061930 | 0.064780 | 0.059512 | -0.029993 | 0.004810 | 0.170569 | 0.155948 | 0.181212 | 0.001694 | -0.003594 | -0.072206 | 0.038061 | -0.570053 | 1.000000 | -0.288843 |
| Contract_Two year | -0.068806 | 0.017745 | -0.116205 | 0.247334 | 0.201699 | 0.163149 | 0.239499 | 0.251721 | 0.563801 | -0.016037 | -0.217542 | -0.081092 | 0.111391 | 0.165248 | 0.072124 | 0.075603 | 0.084362 | -0.155511 | -0.073256 | 0.358036 | 0.357385 | 0.388675 | -0.102756 | 0.106618 | -0.200608 | 0.217542 | -0.621933 | -0.288843 | 1.000000 |
In [13]:
### Getting the top 10 correlated pairs of columns
high_correlated_columns = high_correlation_matrix.unstack().drop_duplicates()
high_correlated_columns = high_correlated_columns[high_correlated_columns != 1.0].sort_values(ascending=False)
top_10_correlated_pairs = high_correlated_columns.head(10)
top_10_correlated_pairs_df = pd.DataFrame(top_10_correlated_pairs).reset_index()
top_10_correlated_pairs_df.columns = ['Column 1', 'Column 2', 'Correlation']
print("Top 10 correlated pairs of columns:")
display(top_10_correlated_pairs_df)
print('\n')
Top 10 correlated pairs of columns:
| Column 1 | Column 2 | Correlation | |
|---|---|---|---|
| 0 | Total Charges | Total Revenue | 0.972185 |
| 1 | Partner | Referred a Friend | 0.949904 |
| 2 | Tenure | Total Revenue | 0.852977 |
| 3 | Streaming Movies | Streaming Music | 0.848367 |
| 4 | Tenure | Total Charges | 0.825880 |
| 5 | Total Long Distance Charges | Total Revenue | 0.778407 |
| 6 | Monthly Charges | Internet Type_Fiber Optic | 0.774365 |
| 7 | Internet Service | Monthly Charges | 0.763191 |
| 8 | Internet Service | Unlimited Data | 0.755178 |
| 9 | Dependents | Number of Dependents | 0.723428 |
In [14]:
### Churn vs Numerical Features
plt.figure(figsize=(20, 25))
### Churn vs Age
plt.subplot(4, 3, 1)
sns.boxplot(x='Churn', y='Age', data=telco_churn_df)
plt.title('Churn vs Age')
### Churn vs Number of Dependents
plt.subplot(4, 3, 2)
sns.boxplot(x='Churn', y='Number of Dependents', data=telco_churn_df)
plt.title('Churn vs Number of Dependents')
### Churn vs Number of Referrals
plt.subplot(4, 3, 3)
sns.boxplot(x='Churn', y='Number of Referrals', data=telco_churn_df)
plt.title('Churn vs Number of Referrals')
### Churn vs Tenure
plt.subplot(4, 3, 4)
sns.boxplot(x='Churn', y='Tenure', data=telco_churn_df)
plt.title('Churn vs Tenure')
### Churn vs Avg Monthly Long Distance Charges
plt.subplot(4, 3, 5)
sns.boxplot(x='Churn', y='Avg Monthly Long Distance Charges', data=telco_churn_df)
plt.title('Churn vs Avg Monthly Long Distance Charges')
### Churn vs Avg Monthly GB Download
plt.subplot(4, 3, 6)
sns.boxplot(x='Churn', y='Avg Monthly GB Download', data=telco_churn_df)
plt.title('Churn vs Avg Monthly GB Download')
### Churn vs Monthly Charges
plt.subplot(4, 3, 7)
sns.boxplot(x='Churn', y='Monthly Charges', data=telco_churn_df)
plt.title('Churn vs Monthly Charges')
### Churn vs Total Charges
plt.subplot(4, 3, 8)
sns.boxplot(x='Churn', y='Total Charges', data=telco_churn_df)
plt.title('Churn vs Total Charges')
### Churn vs Total Refunds
plt.subplot(4, 3, 9)
sns.boxplot(x='Churn', y='Total Refunds', data=telco_churn_df)
plt.title('Churn vs Total Refunds')
### Churn vs Total Extra Data Charges
plt.subplot(4, 3, 10)
sns.boxplot(x='Churn', y='Total Extra Data Charges', data=telco_churn_df)
plt.title('Churn vs Total Extra Data Charges')
### Churn vs Total Long Distance Charges
plt.subplot(4, 3, 11)
sns.boxplot(x='Churn', y='Total Long Distance Charges', data=telco_churn_df)
plt.title('Churn vs Total Long Distance Charges')
### Churn vs Total Revenue
plt.subplot(4, 3, 12)
sns.boxplot(x='Churn', y='Total Revenue', data=telco_churn_df)
plt.title('Churn vs Total Revenue')
plt.tight_layout()
plt.show()
In [15]:
### Churn vs Categorical Features
plt.figure(figsize=(20, 35))
### Churn vs Gender
plt.subplot(7, 3, 1)
sns.countplot(x = 'Gender', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Gender')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Under 30
plt.subplot(7, 3, 2)
sns.countplot(x = 'Under 30', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Under 30')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Senior Citizen
plt.subplot(7, 3, 3)
sns.countplot(x = 'Senior Citizen', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Senior Citizen')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Partner
plt.subplot(7, 3, 4)
sns.countplot(x = 'Partner', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Partner')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Dependents
plt.subplot(7, 3, 5)
sns.countplot(x = 'Dependents', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Dependents')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Referred a Friend
plt.subplot(7, 3, 6)
sns.countplot(x = 'Referred a Friend', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Referred a Friend')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Phone Service
plt.subplot(7, 3, 7)
sns.countplot(x = 'Phone Service', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Phone Service')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Multiple Lines
plt.subplot(7, 3, 8)
sns.countplot(x = 'Multiple Lines', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Multiple Lines')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Internet Service
plt.subplot(7, 3, 9)
sns.countplot(x = 'Internet Service', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Internet Service')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Internet Type
plt.subplot(7, 3, 10)
sns.countplot(x='Internet Type', hue='Churn', data=telco_churn_df)
plt.title('Churn vs Internet Type')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Online Security
plt.subplot(7, 3, 11)
sns.countplot(x = 'Online Security', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Online Security')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Online Backup
plt.subplot(7, 3, 12)
sns.countplot(x = 'Online Backup', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Online Backup')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Device Protection Plan
plt.subplot(7, 3, 13)
sns.countplot(x = 'Device Protection Plan', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Device Protection Plan')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Premium Tech Support
plt.subplot(7, 3, 14)
sns.countplot(x = 'Premium Tech Support', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Premium Tech Support')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Streaming TV
plt.subplot(7, 3, 15)
sns.countplot(x = 'Streaming TV', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Streaming TV')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Streaming Movies
plt.subplot(7, 3, 16)
sns.countplot(x = 'Streaming Movies', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Streaming Movies')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Streaming Music
plt.subplot(7, 3, 17)
sns.countplot(x = 'Streaming Music', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Streaming Music')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Unlimited Data
plt.subplot(7, 3, 18)
sns.countplot(x = 'Unlimited Data', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Unlimited Data')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Contract
plt.subplot(7, 3, 19)
sns.countplot(x = 'Contract', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Contract')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Paperless Billing
plt.subplot(7, 3, 20)
sns.countplot(x = 'Paperless Billing', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Paperless Billing')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Payment Method
plt.subplot(7, 3, 21)
sns.countplot(x = 'Payment Method', hue = 'Churn', data = telco_churn_df)
plt.title('Churn vs Payment Method')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [16]:
### Defining the input feature matrix (X) and the target variable vector (y).
X = telco_churn_df_encoded_excluding_churn.copy()
y = telco_churn_df_encoded['Churn']
### Creating a list of the continuous features that we will be scaling, we do not want to scale the binary indicator columns only the continuous numerical columns.
continuous_columns = [
'Age',
'Number of Dependents',
'Number of Referrals',
'Tenure',
'Avg Monthly Long Distance Charges',
'Avg Monthly GB Download',
'Monthly Charges',
'Total Charges',
'Total Refunds',
'Total Extra Data Charges',
'Total Long Distance Charges',
'Total Revenue'
]
#############################################################################################################################################################################################
### Defining Sampling/Scaling techniques and the classification models we are going to be comparing.
classification_models = {
'logistic_regression': LogisticRegression(random_state = 22),
'support_vector_classifier': SVC(probability = True, random_state = 22),
'decision_tree_classifier': DecisionTreeClassifier(random_state = 22),
'random_forest_classifier': RandomForestClassifier(random_state = 22),
'gaussian_naive_bayes_classifier': GaussianNB(),
'xgboost_classifier': XGBClassifier(random_state = 22)
}
scaling_techniques = {
'no_scaling': None,
'standard_scaler': StandardScaler(),
'robust_scaler': RobustScaler(),
'min_max_scaler': MinMaxScaler()
}
sampling_techniques = {
'no_sampling': None,
'random_undersampling': RandomUnderSampler(random_state = 22),
'random_oversampling': RandomOverSampler(random_state = 22),
'smote': SMOTE(random_state = 22)
}
#############################################################################################################################################################################################
### Setting up KFold crossvalidation with 5 splits.
cross_validation_split = KFold(n_splits = 5, shuffle = True, random_state = 22)
### Creating containers to store average model metrics information and ROC curve information (false positive rate and true positive rate arrays).
average_model_metrics = []
roc_data = {}
### Looping through each combination of model, scaling technique, and sampling technique.
for classification_model_name, classification_model in classification_models.items():
for scaling_technique_name, scaling_technique in scaling_techniques.items():
for sampling_technique_name, sampling_technique in sampling_techniques.items():
### Creating containers to store model metric information for each cross validation fold.
accuracy_scores_per_fold = []
precision_scores_per_fold = []
recall_scores_per_fold = []
f1_scores_per_fold = []
log_loss_values_per_fold = []
roc_auc_scores_per_fold = []
### Creating containers to store true labels and predicted probabilities across all folds for ROC curve calculations.
all_true_labels = []
all_probabilities = []
### Setting up cross validation dataset splits
for train_indexes, test_indexes in cross_validation_split.split(X, y):
X_train = X.iloc[train_indexes].copy()
X_test = X.iloc[test_indexes].copy()
y_train = y.iloc[train_indexes].copy()
y_test = y.iloc[test_indexes].copy()
### Applying the scaling techniques to only the continuous features defined above.
if scaling_technique is not None:
X_train.loc[:, continuous_columns] = scaling_technique.fit_transform(X_train[continuous_columns])
X_test.loc[:, continuous_columns] = scaling_technique.transform(X_test[continuous_columns])
### Applying the sampling techniques.
if sampling_technique is not None:
X_train, y_train = sampling_technique.fit_resample(X_train, y_train)
### Training the classification model.
classification_model.fit(X_train, y_train)
### Generating predictions and probabilities of churn on the testing dataset.
y_predicted_labels = classification_model.predict(X_test)
y_predicted_probabilities = classification_model.predict_proba(X_test)[:, 1]
### Calculating performance metrics for each cross validation fold.
accuracy_scores_per_fold.append(accuracy_score(y_test, y_predicted_labels))
precision_scores_per_fold.append(precision_score(y_test, y_predicted_labels))
recall_scores_per_fold.append(recall_score(y_test, y_predicted_labels))
f1_scores_per_fold.append(f1_score(y_test, y_predicted_labels))
log_loss_values_per_fold.append(log_loss(y_test, y_predicted_probabilities))
roc_auc_scores_per_fold.append(roc_auc_score(y_test, y_predicted_probabilities))
### Collecting data for ROC curve
all_true_labels.extend(y_test.tolist())
all_probabilities.extend(y_predicted_probabilities.tolist())
### Computing a single ROC curve for each combination across all folds.
false_positive_rates, true_positive_rates, _ = roc_curve(all_true_labels, all_probabilities)
### Storing averaged metrics across all folds for each combination.
average_model_metrics.append({
'classification_model': classification_model_name,
'scaling_technique': scaling_technique_name,
'sampling_technique': sampling_technique_name,
'accuracy': np.mean(accuracy_scores_per_fold),
'precision': np.mean(precision_scores_per_fold),
'recall': np.mean(recall_scores_per_fold),
'f1_score': np.mean(f1_scores_per_fold),
'log_loss': np.mean(log_loss_values_per_fold),
'roc_auc': np.mean(roc_auc_scores_per_fold)
})
### Storing ROC curve information for plotting
roc_data[(classification_model_name, scaling_technique_name, sampling_technique_name)] = (false_positive_rates, true_positive_rates)
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
C:\Users\Adam\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [17]:
### Converting the results from the cross validations into a pandas dataframe.
results_df = pd.DataFrame(average_model_metrics)
results_df = results_df
print('results_df:')
display(results_df)
print('\n')
print('\n')
#############################################################################################################################################################################################
### Creating Top 10 performing dataframes by F1, ROC_AUC, and Recall Scores.
top_10_combinations_by_f1_score = results_df.sort_values(by = 'f1_score', ascending = False).head(10)
print('top_10_combinations_by_f1_score:')
display(top_10_combinations_by_f1_score)
print('\n')
print('\n')
top_10_combinations_by_roc_auc = results_df.sort_values(by = 'roc_auc', ascending = False).head(10)
print('top_10_combinations_by_roc_auc:')
display(top_10_combinations_by_roc_auc)
print('\n')
print('\n')
top_10_combinations_by_recall = results_df.sort_values(by = 'recall', ascending = False).head(10)
print('top_10_combinations_by_recall:')
display(top_10_combinations_by_recall)
print('\n')
print('\n')
#############################################################################################################################################################################################
### Creating a DF for the best combination of scaling/sampling techniques by classification model.
best_combination_by_classification_model_f1 = results_df.loc[results_df.groupby('classification_model')['f1_score'].idxmax()]
best_combination_by_classification_model_f1 = best_combination_by_classification_model_f1.sort_values(by = 'f1_score', ascending = False)
print('best_combination_by_classification_model_f1:')
display(best_combination_by_classification_model_f1)
print('\n')
print('\n')
best_combination_by_classification_model_roc = results_df.loc[results_df.groupby('classification_model')['roc_auc'].idxmax()]
best_combination_by_classification_model_roc = best_combination_by_classification_model_roc.sort_values(by = 'roc_auc', ascending = False)
print('best_combination_by_classification_model_roc:')
display(best_combination_by_classification_model_roc)
print('\n')
print('\n')
best_combination_by_classification_model_recall = results_df.loc[results_df.groupby('classification_model')['recall'].idxmax()]
best_combination_by_classification_model_recall = best_combination_by_classification_model_recall.sort_values(by = 'recall', ascending = False)
print('best_combination_by_classification_model_recall:')
display(best_combination_by_classification_model_recall)
print('\n')
print('\n')
results_df:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | logistic_regression | no_scaling | no_sampling | 0.782420 | 0.618056 | 0.473391 | 0.535902 | 0.451874 | 0.812681 |
| 1 | logistic_regression | no_scaling | random_undersampling | 0.721556 | 0.485889 | 0.797014 | 0.603352 | 0.519924 | 0.827531 |
| 2 | logistic_regression | no_scaling | random_oversampling | 0.719140 | 0.482577 | 0.806369 | 0.603679 | 0.522623 | 0.827346 |
| 3 | logistic_regression | no_scaling | smote | 0.726960 | 0.491325 | 0.796354 | 0.607425 | 0.520543 | 0.826521 |
| 4 | logistic_regression | standard_scaler | no_sampling | 0.827218 | 0.690538 | 0.634850 | 0.661133 | 0.366044 | 0.885327 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 91 | xgboost_classifier | robust_scaler | smote | 0.822383 | 0.667102 | 0.662604 | 0.664681 | 0.392401 | 0.886466 |
| 92 | xgboost_classifier | min_max_scaler | no_sampling | 0.826080 | 0.697425 | 0.613488 | 0.652006 | 0.390723 | 0.887037 |
| 93 | xgboost_classifier | min_max_scaler | random_undersampling | 0.791098 | 0.575542 | 0.815427 | 0.674655 | 0.485473 | 0.883479 |
| 94 | xgboost_classifier | min_max_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 95 | xgboost_classifier | min_max_scaler | smote | 0.824232 | 0.669644 | 0.669053 | 0.669067 | 0.391545 | 0.887292 |
96 rows × 9 columns
top_10_combinations_by_f1_score:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 90 | xgboost_classifier | robust_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 94 | xgboost_classifier | min_max_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 82 | xgboost_classifier | no_scaling | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 86 | xgboost_classifier | standard_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 85 | xgboost_classifier | standard_scaler | random_undersampling | 0.791098 | 0.575542 | 0.815427 | 0.674655 | 0.485473 | 0.883479 |
| 93 | xgboost_classifier | min_max_scaler | random_undersampling | 0.791098 | 0.575542 | 0.815427 | 0.674655 | 0.485473 | 0.883479 |
| 81 | xgboost_classifier | no_scaling | random_undersampling | 0.791098 | 0.575542 | 0.815427 | 0.674655 | 0.485473 | 0.883479 |
| 89 | xgboost_classifier | robust_scaler | random_undersampling | 0.791098 | 0.575542 | 0.815427 | 0.674655 | 0.485473 | 0.883479 |
| 53 | random_forest_classifier | standard_scaler | random_undersampling | 0.787257 | 0.568583 | 0.826544 | 0.673595 | 0.458929 | 0.875245 |
| 22 | support_vector_classifier | standard_scaler | random_oversampling | 0.787684 | 0.569760 | 0.824689 | 0.673592 | 0.422451 | 0.880030 |
top_10_combinations_by_roc_auc:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 87 | xgboost_classifier | standard_scaler | smote | 0.829492 | 0.686490 | 0.661427 | 0.673444 | 0.385130 | 0.890087 |
| 95 | xgboost_classifier | min_max_scaler | smote | 0.824232 | 0.669644 | 0.669053 | 0.669067 | 0.391545 | 0.887292 |
| 88 | xgboost_classifier | robust_scaler | no_sampling | 0.826080 | 0.697425 | 0.613488 | 0.652006 | 0.390723 | 0.887037 |
| 92 | xgboost_classifier | min_max_scaler | no_sampling | 0.826080 | 0.697425 | 0.613488 | 0.652006 | 0.390723 | 0.887037 |
| 84 | xgboost_classifier | standard_scaler | no_sampling | 0.826080 | 0.697425 | 0.613488 | 0.652006 | 0.390723 | 0.887037 |
| 80 | xgboost_classifier | no_scaling | no_sampling | 0.826080 | 0.697425 | 0.613488 | 0.652006 | 0.390723 | 0.887037 |
| 83 | xgboost_classifier | no_scaling | smote | 0.827787 | 0.694199 | 0.629963 | 0.660297 | 0.388941 | 0.886955 |
| 90 | xgboost_classifier | robust_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 86 | xgboost_classifier | standard_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 94 | xgboost_classifier | min_max_scaler | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
top_10_combinations_by_recall:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 21 | support_vector_classifier | standard_scaler | random_undersampling | 0.777586 | 0.554109 | 0.841799 | 0.667910 | 0.443162 | 0.879597 |
| 14 | logistic_regression | min_max_scaler | random_oversampling | 0.779293 | 0.556631 | 0.834832 | 0.667701 | 0.433739 | 0.882812 |
| 6 | logistic_regression | standard_scaler | random_oversampling | 0.780573 | 0.558455 | 0.834342 | 0.668893 | 0.431611 | 0.883752 |
| 10 | logistic_regression | robust_scaler | random_oversampling | 0.778156 | 0.555041 | 0.834289 | 0.666389 | 0.434312 | 0.882462 |
| 9 | logistic_regression | robust_scaler | random_undersampling | 0.779151 | 0.556693 | 0.834230 | 0.667397 | 0.435737 | 0.881951 |
| 5 | logistic_regression | standard_scaler | random_undersampling | 0.778298 | 0.555471 | 0.832649 | 0.666092 | 0.432980 | 0.883545 |
| 29 | support_vector_classifier | min_max_scaler | random_undersampling | 0.766353 | 0.539656 | 0.829936 | 0.653644 | 0.467666 | 0.863999 |
| 13 | logistic_regression | min_max_scaler | random_undersampling | 0.777730 | 0.554925 | 0.828934 | 0.664546 | 0.438325 | 0.880823 |
| 53 | random_forest_classifier | standard_scaler | random_undersampling | 0.787257 | 0.568583 | 0.826544 | 0.673595 | 0.458929 | 0.875245 |
| 57 | random_forest_classifier | robust_scaler | random_undersampling | 0.786688 | 0.567830 | 0.825442 | 0.672703 | 0.459067 | 0.875099 |
best_combination_by_classification_model_f1:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 82 | xgboost_classifier | no_scaling | random_oversampling | 0.817974 | 0.642409 | 0.711859 | 0.675115 | 0.406087 | 0.886566 |
| 53 | random_forest_classifier | standard_scaler | random_undersampling | 0.787257 | 0.568583 | 0.826544 | 0.673595 | 0.458929 | 0.875245 |
| 22 | support_vector_classifier | standard_scaler | random_oversampling | 0.787684 | 0.569760 | 0.824689 | 0.673592 | 0.422451 | 0.880030 |
| 6 | logistic_regression | standard_scaler | random_oversampling | 0.780573 | 0.558455 | 0.834342 | 0.668893 | 0.431611 | 0.883752 |
| 71 | gaussian_naive_bayes_classifier | standard_scaler | smote | 0.769764 | 0.547335 | 0.780207 | 0.642949 | 2.185817 | 0.849863 |
| 37 | decision_tree_classifier | standard_scaler | random_undersampling | 0.730519 | 0.494910 | 0.725554 | 0.588345 | 9.713094 | 0.728828 |
best_combination_by_classification_model_roc:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 87 | xgboost_classifier | standard_scaler | smote | 0.829492 | 0.686490 | 0.661427 | 0.673444 | 0.385130 | 0.890087 |
| 4 | logistic_regression | standard_scaler | no_sampling | 0.827218 | 0.690538 | 0.634850 | 0.661133 | 0.366044 | 0.885327 |
| 22 | support_vector_classifier | standard_scaler | random_oversampling | 0.787684 | 0.569760 | 0.824689 | 0.673592 | 0.422451 | 0.880030 |
| 55 | random_forest_classifier | standard_scaler | smote | 0.822099 | 0.670228 | 0.650816 | 0.659916 | 0.416036 | 0.879037 |
| 64 | gaussian_naive_bayes_classifier | no_scaling | no_sampling | 0.766352 | 0.541913 | 0.786551 | 0.641472 | 1.763892 | 0.850590 |
| 37 | decision_tree_classifier | standard_scaler | random_undersampling | 0.730519 | 0.494910 | 0.725554 | 0.588345 | 9.713094 | 0.728828 |
best_combination_by_classification_model_recall:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 21 | support_vector_classifier | standard_scaler | random_undersampling | 0.777586 | 0.554109 | 0.841799 | 0.667910 | 0.443162 | 0.879597 |
| 14 | logistic_regression | min_max_scaler | random_oversampling | 0.779293 | 0.556631 | 0.834832 | 0.667701 | 0.433739 | 0.882812 |
| 53 | random_forest_classifier | standard_scaler | random_undersampling | 0.787257 | 0.568583 | 0.826544 | 0.673595 | 0.458929 | 0.875245 |
| 81 | xgboost_classifier | no_scaling | random_undersampling | 0.791098 | 0.575542 | 0.815427 | 0.674655 | 0.485473 | 0.883479 |
| 70 | gaussian_naive_bayes_classifier | standard_scaler | random_oversampling | 0.756967 | 0.528309 | 0.802096 | 0.636806 | 2.009347 | 0.849655 |
| 41 | decision_tree_classifier | robust_scaler | random_undersampling | 0.730092 | 0.494400 | 0.726121 | 0.588180 | 9.728468 | 0.728730 |
In [18]:
### Plotting ROC curves for each type of classification model.
### Getting the unique model names from the results dataframe.
unique_model_names = results_df['classification_model'].unique()
### Looping through each model type and creating a ROC curve for each type containing all combinations of scaling and sampling techniques.
for model_name in unique_model_names:
plt.figure(figsize = (10, 8))
for (model_key, scaler_key, sampler_key), (false_positive_rate, true_positive_rate) in roc_data.items():
if model_key == model_name:
combination_label = f"{model_key} | {scaler_key} | {sampler_key}"
sns.lineplot(x = false_positive_rate, y = true_positive_rate, label = combination_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title(f"ROC Curves for {model_name}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc = "lower right", fontsize = 'small')
plt.tight_layout()
plt.show()
In [19]:
### Plotting ROC curves for the best scaling/sampling technique combinations for each type of classification model.
### Looping through each best scaling/sampling technique combinations and creating a ROC curve.
for index, row in best_combination_by_classification_model_f1.iterrows():
model_name = row['classification_model']
scaler_name = row['scaling_technique']
sampler_name = row['sampling_technique']
combination_key = (model_name, scaler_name, sampler_name)
if combination_key in roc_data:
false_positive_rate, true_positive_rate = roc_data[combination_key]
plt.figure(figsize = (10, 8))
combination_label = f"{model_name} | {scaler_name} | {sampler_name}"
sns.lineplot(x = false_positive_rate, y = true_positive_rate, label = combination_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title(f"ROC Curve of Best Scaling/Sampling Combination for {model_name} Based on F1 Score")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(
loc = "lower right", fontsize = 'small')
plt.tight_layout()
plt.show()
In [20]:
### Hyper Parameter tuning for the best combination of scaling and sampling techniques for each model.
### Defining the hyper parameters for each model.
hyperparameter_grids = {
'logistic_regression': {
'C': [0.1, 1.0, 10], ### Regularization strength: The smaller the value the stronger the regularization. [DEFAULT: 1.0]
'penalty': ['l2'], ### Type of regularization applied to the model. L1 = Lasso, L2 = Ridge; we are not using L1 because it is not compatible with LBFGS. [DEFAULT: L2]
'solver': ['liblinear', 'lbfgs'], ### Algorithm used to optimize the parameters. [DEFAULT: LBFGS]
'class_weight': [None, 'balanced'] ### Class Weighting: Balanced automatically adjusts the weights inversely proportional to class frequencies [DEFAULT: NONE]
}
,
'support_vector_classifier': {
'C': [0.1, 1.0, 10], ### Regularization strength: The smaller the value the stronger the regularization. [DEFAULT: 1.0]
'kernel': ['linear', 'rbf', 'sigmoid'], ### Kernel type used in the algorithm. [DEFAULT: RBF]
'gamma': ['scale', 'auto', 0.1, 1.0], ### Kernel coefficient. [DEFAULT: SCALE]
'class_weight': [None, 'balanced'] ### Class Weighting: Balanced automatically adjusts the weights inversely proportional to class frequencies [DEFAULT: NONE]
},
'decision_tree_classifier': {
'criterion': ['gini', 'entropy', 'log_loss'], ### The function used to measure the quality of a split. [DEFAULT: GINI]
'max_depth': [None, 5, 10, 15], ### Max depth of the tree. [DEFAULT: NONE]
'min_samples_split': [2, 10, 20], ### The minimum number of samples required to split an internal node. [DEFAULT: 2]
'class_weight': [None, 'balanced'] ### Class Weighting: Balanced automatically adjusts the weights inversely proportional to class frequencies [DEFAULT: NONE]
},
'random_forest_classifier': {
'n_estimators': [100, 200, 300], ### Number of trees in the forest. [DEFAULT: 100]
'criterion': ['gini', 'entropy', 'log_loss'], ### The function used to measure the quality of a split. [DEFAULT: GINI]
'max_depth': [None, 5, 10, 15], ### Max depth of the tree. [DEFAULT: NONE]
'min_samples_split': [2, 10, 20], ### The minimum number of samples required to split an internal node. [DEFAULT: 2]
'max_features': [None, 'sqrt', 'log2'], ### The number of features to consider when looking for the best split. [DEFAULT: SQRT]
'class_weight': [None, 'balanced'] ### Class Weighting: Balanced automatically adjusts the weights inversely proportional to class frequencies [DEFAULT: NONE]
},
'gaussian_naive_bayes_classifier': {
'var_smoothing': [1e-9, 1e-7, 1e-5] ### Artificially adding a value to the variance of each feature, widening the distribution and accounting for more samples further from the mean. [DEFAULT: 1e-09]
},
'xgboost_classifier': {
'n_estimators': [100, 200, 300], ### Number of boosting rounds. [DEFAULT: 100]
'learning_rate': [0.1, 0.2, 0.3], ### Step size shrinkage [DEFAULT: 0.3]
'max_depth': [3, 6], ### Max depth of the trees. [DEFAULT: 6]
'subsample': [0.8, 1.0], ### The fraction of samples to be randomly sampled to build each tree. [DEFAULT: 1.0]
'colsample_bytree': [0.8, 1.0] ### The fraction of features to be randomly sampled for building each tree. [DEFAULT: 1.0]
}
}
#############################################################################################################################################################################################
### Defining additional scoring metrics for Gridsearch evaluation.
scoring_metrics = {
'accuracy': 'accuracy',
'precision': 'precision',
'recall': 'recall',
'f1': 'f1',
'roc_auc': 'roc_auc'
}
#############################################################################################################################################################################################
### Setting up KFold crossvalidation with 5 splits.
cross_validation_strategy = KFold(n_splits = 5, shuffle = True, random_state = 22)
### Creating a container to store the results of each gridsearch.
gridsearch_results_list = []
### Looping through the best combination of scaling and sampling techniques for each model based on F1 score.
for row_index, configuration_combination in best_combination_by_classification_model_f1.iterrows():
classification_model_name = configuration_combination['classification_model']
scaling_technique_name = configuration_combination['scaling_technique']
sampling_technique_name = configuration_combination['sampling_technique']
### Getting the model, scaling/sampling techniques, and hyperparameter grids.
classification_model = classification_models[classification_model_name]
scaling_technique = scaling_techniques[scaling_technique_name]
sampling_technique = sampling_techniques[sampling_technique_name]
hyperparameter_grid = hyperparameter_grids.get(classification_model_name, {})
### Defining the input feature matrix (X) and the target variable vector (y).
X_2 = X.copy()
y_2 = y.copy()
### Applying the optimal scaling techniques for each model.
if scaling_technique is not None:
X_2[continuous_columns] = scaling_technique.fit_transform(X_2[continuous_columns])
### Applying the optimal sampling techniques for each model.
if sampling_technique is not None:
X_2, y_2 = sampling_technique.fit_resample(X_2, y_2)
### Setting up GridSearchCV to find the optimal combination of hyperparameters for each optimal combination of scaling/sampling techniques for each model.
### Scoring based on ROC_AUC
grid_search = GridSearchCV(
estimator = classification_model,
param_grid = hyperparameter_grid,
scoring = scoring_metrics,
refit = 'f1',
cv = cross_validation_strategy)
grid_search.fit(X_2, y_2)
### Storing the best hyperparameters for each model from the gridsearch results.
gridsearch_results_list.append({
'classification_model': classification_model_name,
'scaling_technique': scaling_technique_name,
'sampling_technique': sampling_technique_name,
'best_hyperparameters': grid_search.best_params_,
'accuracy': grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_],
'precision': grid_search.cv_results_['mean_test_precision'][grid_search.best_index_],
'recall': grid_search.cv_results_['mean_test_recall'][grid_search.best_index_],
'f1_score': grid_search.best_score_,
'roc_auc': grid_search.cv_results_['mean_test_roc_auc'][grid_search.best_index_]
})
In [21]:
print('Optimized Hyper Parameters Selected Based on The Best Average F1 Score.')
display(gridsearch_results_list)
print('\n')
Optimized Hyper Parameters Selected Based on The Best Average F1 Score.
[{'classification_model': 'xgboost_classifier',
'scaling_technique': 'no_scaling',
'sampling_technique': 'random_oversampling',
'best_hyperparameters': {'colsample_bytree': 0.8,
'learning_rate': 0.2,
'max_depth': 6,
'n_estimators': 300,
'subsample': 0.8},
'accuracy': 0.9142941994097917,
'precision': 0.8745515273818633,
'recall': 0.967229663433797,
'f1_score': 0.9185548531948735,
'roc_auc': 0.9710510634326217},
{'classification_model': 'random_forest_classifier',
'scaling_technique': 'standard_scaler',
'sampling_technique': 'random_undersampling',
'best_hyperparameters': {'class_weight': None,
'criterion': 'entropy',
'max_depth': 10,
'max_features': None,
'min_samples_split': 2,
'n_estimators': 300},
'accuracy': 0.7969560953260457,
'precision': 0.7670849913568768,
'recall': 0.8541957861050818,
'f1_score': 0.8079683877935653,
'roc_auc': 0.8784744131975708},
{'classification_model': 'support_vector_classifier',
'scaling_technique': 'standard_scaler',
'sampling_technique': 'random_oversampling',
'best_hyperparameters': {'C': 10,
'class_weight': None,
'gamma': 1.0,
'kernel': 'rbf'},
'accuracy': 0.9330815298538073,
'precision': 0.9600035451334803,
'recall': 0.9038578112194156,
'f1_score': 0.9310570315354507,
'roc_auc': 0.9675253606363707},
{'classification_model': 'logistic_regression',
'scaling_technique': 'standard_scaler',
'sampling_technique': 'random_oversampling',
'best_hyperparameters': {'C': 1.0,
'class_weight': 'balanced',
'penalty': 'l2',
'solver': 'lbfgs'},
'accuracy': 0.7996324675537761,
'precision': 0.7812628625514082,
'recall': 0.8321832987241178,
'f1_score': 0.8059039079205892,
'roc_auc': 0.887956007215751},
{'classification_model': 'gaussian_naive_bayes_classifier',
'scaling_technique': 'standard_scaler',
'sampling_technique': 'smote',
'best_hyperparameters': {'var_smoothing': 1e-09},
'accuracy': 0.7872356543976148,
'precision': 0.7758240583947038,
'recall': 0.8082660804470712,
'f1_score': 0.7916670256313145,
'roc_auc': 0.8697001186508084},
{'classification_model': 'decision_tree_classifier',
'scaling_technique': 'standard_scaler',
'sampling_technique': 'random_undersampling',
'best_hyperparameters': {'class_weight': None,
'criterion': 'gini',
'max_depth': 5,
'min_samples_split': 2},
'accuracy': 0.7760822255152517,
'precision': 0.7449526453706763,
'recall': 0.8425258526501521,
'f1_score': 0.7898957571367448,
'roc_auc': 0.8483105914632645}]
In [22]:
### Converting the results into a pandas dataframe.
gridsearch_results_dataframe = pd.DataFrame(gridsearch_results_list).sort_values(by = 'f1_score', ascending = False)
print('gridsearch_results_dataframe:')
print('Optimized Hyper Parameters Selected Based on The Best Average F1 Score:')
display(gridsearch_results_dataframe)
print('\n')
gridsearch_results_dataframe: Optimized Hyper Parameters Selected Based on The Best Average F1 Score:
| classification_model | scaling_technique | sampling_technique | best_hyperparameters | accuracy | precision | recall | f1_score | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | support_vector_classifier | standard_scaler | random_oversampling | {'C': 10, 'class_weight': None, 'gamma': 1.0, ... | 0.933082 | 0.960004 | 0.903858 | 0.931057 | 0.967525 |
| 0 | xgboost_classifier | no_scaling | random_oversampling | {'colsample_bytree': 0.8, 'learning_rate': 0.2... | 0.914294 | 0.874552 | 0.967230 | 0.918555 | 0.971051 |
| 1 | random_forest_classifier | standard_scaler | random_undersampling | {'class_weight': None, 'criterion': 'entropy',... | 0.796956 | 0.767085 | 0.854196 | 0.807968 | 0.878474 |
| 3 | logistic_regression | standard_scaler | random_oversampling | {'C': 1.0, 'class_weight': 'balanced', 'penalt... | 0.799632 | 0.781263 | 0.832183 | 0.805904 | 0.887956 |
| 4 | gaussian_naive_bayes_classifier | standard_scaler | smote | {'var_smoothing': 1e-09} | 0.787236 | 0.775824 | 0.808266 | 0.791667 | 0.869700 |
| 5 | decision_tree_classifier | standard_scaler | random_undersampling | {'class_weight': None, 'criterion': 'gini', 'm... | 0.776082 | 0.744953 | 0.842526 | 0.789896 | 0.848311 |
In [23]:
### Plotting the performance metrics from the models after optimizing the hyperparameters
plt.figure(figsize = (20, 15))
### Plotting Accuracy Performance
plt.subplot(2, 3, 1)
sns.barplot(x = 'classification_model', y = 'accuracy', data = gridsearch_results_dataframe.sort_values(by = 'accuracy', ascending = False))
plt.title('Model Accuracy')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Precision Performance
plt.subplot(2, 3, 2)
sns.barplot(x = 'classification_model', y = 'precision', data = gridsearch_results_dataframe.sort_values(by = 'precision', ascending = False))
plt.title('Model Precision')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Recall Performance
plt.subplot(2, 3, 3)
sns.barplot(x = 'classification_model', y = 'recall', data = gridsearch_results_dataframe.sort_values(by = 'recall', ascending = False))
plt.title('Model Recall')
plt.xticks(rotation = 45, ha = 'right')
### Plotting F1 Score Performance
plt.subplot(2, 3, 4)
sns.barplot(x = 'classification_model', y = 'f1_score', data = gridsearch_results_dataframe.sort_values(by = 'f1_score', ascending = False))
plt.title('Model F1 Score')
plt.xticks(rotation = 45, ha = 'right')
### Plotting ROC AUC Performance
plt.subplot(2, 3, 5)
sns.barplot(x = 'classification_model', y = 'roc_auc', data = gridsearch_results_dataframe.sort_values(by = 'roc_auc', ascending = False))
plt.title('Model ROC AUC')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [24]:
### SHAP Feature Importances
### Extract the optimal XGBoost configuration from gridsearch_results_list
xgb_config = next(cfg for cfg in gridsearch_results_list if cfg['classification_model'] == 'xgboost_classifier')
### Initializing the XGBoost model with the optimized hyperparameters.
xgb_model = XGBClassifier(random_state= 22)
xgb_model.set_params(**xgb_config['best_hyperparameters'])
### Defining the input feature matrix (X) and the target variable vector (y).
X_SHAP = X.copy()
y_SHAP = y.copy()
### Applying the optimal scaling techniques.
if scaling_technique is not None:
X_SHAP[continuous_columns] = scaling_technique.fit_transform(X_SHAP[continuous_columns])
### Applying the optimal sampling techniques.
if sampling_technique is not None:
X_SHAP, y_SHAP = sampling_technique.fit_resample(X_SHAP, y_SHAP)
### Fitting the model.
xgb_model.fit(X_SHAP, y_SHAP)
### SHAP analysis
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_SHAP)
### Creating a feature importance dataframe.
shap_df = pd.DataFrame({
'Feature': X_SHAP.columns,
'Mean_SHAP_value': shap_values.mean(axis = 0),
'Mean_ABS_SHAP_value': np.abs(shap_values).mean(axis = 0)
}).sort_values('Mean_ABS_SHAP_value', ascending = False)
### Plotting the top 10 Mean SHAP Feature Contributions sorted by mean absolute shap values
plt.figure(figsize = (10, 6))
sns.barplot(data = shap_df.head(10), x = 'Mean_SHAP_value', y = 'Feature')
plt.title('Top 10 Mean SHAP Feature Contributions (via XGBoost)')
plt.xlabel('Mean SHAP Value (Direction + Magnitude)')
plt.ylabel('Features')
plt.axvline(0, color = 'black', linestyle = '--')
plt.tight_layout()
plt.show()
print('\n')
### Displaying the feature importance dataframe.
print('shap_df:')
display(shap_df)
print('\n')
shap_df:
| Feature | Mean_SHAP_value | Mean_ABS_SHAP_value | |
|---|---|---|---|
| 7 | Number of Referrals | -0.363347 | 1.006999 |
| 37 | Contract_Month-to-month | -0.010587 | 0.986154 |
| 22 | Monthly Charges | -0.088123 | 0.771502 |
| 5 | Number of Dependents | -0.312016 | 0.670610 |
| 8 | Tenure | -0.060070 | 0.464272 |
| 4 | Dependents | 0.271081 | 0.432241 |
| 0 | Age | 0.080936 | 0.421825 |
| 23 | Total Charges | 0.025300 | 0.420651 |
| 10 | Avg Monthly Long Distance Charges | 0.059906 | 0.415025 |
| 27 | Total Revenue | -0.114075 | 0.414265 |
| 6 | Referred a Friend | 0.087265 | 0.354522 |
| 26 | Total Long Distance Charges | 0.012631 | 0.334532 |
| 12 | Avg Monthly GB Download | 0.035204 | 0.308798 |
| 13 | Online Security | -0.003409 | 0.272166 |
| 16 | Premium Tech Support | 0.001921 | 0.265257 |
| 42 | Payment Method_Electronic check | -0.006331 | 0.259798 |
| 21 | Paperless Billing | 0.005460 | 0.241869 |
| 39 | Contract_Two year | -0.035785 | 0.188249 |
| 17 | Streaming TV | 0.018091 | 0.176115 |
| 19 | Streaming Music | 0.005309 | 0.154529 |
| 30 | Multiple Lines_No | 0.003493 | 0.146064 |
| 3 | Partner | 0.027072 | 0.145965 |
| 28 | Gender_Female | 0.008303 | 0.130632 |
| 14 | Online Backup | 0.014151 | 0.128032 |
| 43 | Payment Method_Mailed check | 0.001982 | 0.103367 |
| 18 | Streaming Movies | -0.007459 | 0.093174 |
| 33 | Internet Type_Cable | 0.013609 | 0.081805 |
| 15 | Device Protection Plan | 0.010290 | 0.075136 |
| 34 | Internet Type_DSL | 0.001940 | 0.066726 |
| 41 | Payment Method_Credit card (automatic) | -0.002274 | 0.062237 |
| 32 | Multiple Lines_Yes | 0.000740 | 0.055273 |
| 29 | Gender_Male | 0.000302 | 0.041088 |
| 38 | Contract_One year | -0.017796 | 0.041006 |
| 40 | Payment Method_Bank transfer (automatic) | 0.003491 | 0.037484 |
| 25 | Total Extra Data Charges | 0.004867 | 0.036689 |
| 24 | Total Refunds | 0.000743 | 0.035181 |
| 20 | Unlimited Data | 0.005330 | 0.027506 |
| 36 | Internet Type_No Internet | 0.001285 | 0.027077 |
| 35 | Internet Type_Fiber Optic | 0.003778 | 0.026590 |
| 11 | Internet Service | 0.006307 | 0.019949 |
| 1 | Under 30 | -0.000982 | 0.019594 |
| 2 | Senior Citizen | 0.006198 | 0.011788 |
| 9 | Phone Service | -0.001412 | 0.010889 |
| 31 | Multiple Lines_No phone service | 0.000000 | 0.000000 |
In [25]:
###############################################################################################################################################
###############################################################################################################################################
###############################################################################################################################################
###############################################################################################################################################
In [56]:
### Applying feature engineering techniques to the orignal dataframe to see if we can improve results.
### Creating a new DataFrame with engineered Features
telco_churn_engineered_features_df = telco_churn_df.copy()
########################################################################################################################################
### Bracketing Existing Columns
### Age Binning
age_bins = [-1, 10, 20, 30, 40, 50, 60, 70, 80]
age_bin_labels = ['0-10 Years', '11-20 Years', '21-30 Years', '31-40 Years', '41-50 Years', '51-60 Years', '61-70 Years', '71-80 Years']
telco_churn_engineered_features_df['Age_Bracket'] = pd.cut(telco_churn_engineered_features_df['Age'], bins = age_bins, labels = age_bin_labels, right = True)
### Tenure Binning
tenure_bins = [-1, 3, 6, 12, 18, 24, 36, 48, 60, 72]
tenure_bin_labels = ['0-3 Months', '4-6 Months', '7-12 Months', '13-18 Months', '19-24 Months', '25-36 Months', '37-48 Months', '49-60 Months', '61-72 Months']
telco_churn_engineered_features_df['Tenure_Bracket'] = pd.cut(telco_churn_engineered_features_df['Tenure'], bins = tenure_bins, labels = tenure_bin_labels, right = True)
### Monthly Charges Binning
monthly_charge_bins = [-1, 10, 20, 30, 40, 60, 80, 100, 120]
monthly_charge_bin_labels = ['$0-10', '$11-20', '$21-30', '$31-40', '$41-60', '$61-80', '$81-100', '$101-120']
telco_churn_engineered_features_df['Monthly_Charges_Bracket'] = pd.cut(telco_churn_engineered_features_df['Monthly Charges'], bins = monthly_charge_bins, labels = monthly_charge_bin_labels, right = True)
### Average Monthly GB Download Binning
avg_monthly_gb_bins = [-1, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90]
avg_monthly_gb_bin_labels = ['0-5 Gbs', '6-10 Gbs', '11-15 Gbs', '16-20 Gbs', '21-25 Gbs', '26-30 Gbs', '31-40 Gbs', '41-50 Gbs', '51-60 Gbs', '61-70 Gbs', '71-80 Gbs', '81-90 Gbs']
telco_churn_engineered_features_df['AVG_Monthly_Gb_Bracket'] = pd.cut(telco_churn_engineered_features_df['Avg Monthly GB Download'], bins = avg_monthly_gb_bins, labels = avg_monthly_gb_bin_labels, right = True)
########################################################################################################################################
### Deriving New Columns
### Total Number of Services Column
service_columns = ['Phone Service', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data']
telco_churn_engineered_features_df['Total_Number_of_Services'] = telco_churn_engineered_features_df[service_columns].sum(axis = 1).astype(float)
### Service Penetration Rate Column
telco_churn_engineered_features_df['Service_Penetration_Rate'] = (telco_churn_engineered_features_df['Total_Number_of_Services'] / len(service_columns)).astype(float)
### Has Family Indicator Column
telco_churn_engineered_features_df['Has_Family'] = ((telco_churn_engineered_features_df['Partner'] == 1.0) | (telco_churn_engineered_features_df['Dependents'] == 1.0)).astype(float)
### Family Size Column
telco_churn_engineered_features_df['Family_Size'] = ((1.0 + telco_churn_engineered_features_df['Partner']) + telco_churn_engineered_features_df['Number of Dependents']).astype(float)
### Referral Rate
telco_churn_engineered_features_df['Referral_Rate'] = (telco_churn_engineered_features_df['Number of Referrals'] / (telco_churn_engineered_features_df['Tenure'] + 1.0)).astype(float)
### Refund Rate
telco_churn_engineered_features_df['Refund_Rate'] = (telco_churn_engineered_features_df['Total Refunds'] / (telco_churn_engineered_features_df['Total Charges'] + 1.0)).astype(float)
### Monthly Cost per GB
telco_churn_engineered_features_df['Monthly_Cost_Per_Gb'] = (telco_churn_engineered_features_df['Monthly Charges'] / (telco_churn_engineered_features_df['Avg Monthly GB Download'] + 1.0)).astype(float)
### Extra Charges Ratio
telco_churn_engineered_features_df['Extra_Charges_Ratio'] = (telco_churn_engineered_features_df['Total Extra Data Charges'] / (telco_churn_engineered_features_df['Total Charges'] + 1.0)).astype(float)
########################################################################################################################################
### Dropping the columns we applied binning techniques to.
telco_churn_engineered_features_df = telco_churn_engineered_features_df.drop(columns = ['Age', 'Tenure', 'Monthly Charges', 'Avg Monthly GB Download'])
########################################################################################################################################
### One-Hot-Encoding categorical/nominal columns
telco_churn_engineered_features_df_encoded = pd.get_dummies(telco_churn_engineered_features_df, columns=['Gender', 'Multiple Lines', 'Internet Type', 'Contract', 'Payment Method', 'Age_Bracket', 'Tenure_Bracket', 'Monthly_Charges_Bracket', 'AVG_Monthly_Gb_Bracket'], dtype = float)
### Excluding the churn column (the column that we are predicting)
telco_churn_engineered_features_df_encoded_excluding_churn = telco_churn_engineered_features_df_encoded.drop(columns = ['Churn'])
In [58]:
### Displaying the new Dataframe with engineered features.
print("telco_churn_engineered_features_df:")
display(telco_churn_engineered_features_df.head())
print('\n')
### Summary statistics
print("Summary statistics:")
display(telco_churn_engineered_features_df.describe(include = 'all'))
print('\n')
### Number of missing values
print("Number of missing values:")
display(telco_churn_engineered_features_df.isna().sum())
print('\n')
### Number of unique values
print("Number of unique values:")
display(telco_churn_engineered_features_df.nunique())
print('\n')
telco_churn_engineered_features_df:
| Gender | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | Internet Type | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Contract | Paperless Billing | Payment Method | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | Age_Bracket | Tenure_Bracket | Monthly_Charges_Bracket | AVG_Monthly_Gb_Bracket | Total_Number_of_Services | Service_Penetration_Rate | Has_Family | Family_Size | Referral_Rate | Refund_Rate | Monthly_Cost_Per_Gb | Extra_Charges_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2.0 | 1.0 | 42.39 | No | 1.0 | Cable | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | One year | 1.0 | Mailed check | 593.30 | 0.00 | 0.0 | 381.51 | 974.81 | 0.0 | 31-40 Years | 7-12 Months | $61-80 | 16-20 Gbs | 6.0 | 0.6 | 1.0 | 2.0 | 0.200000 | 0.000000 | 3.858824 | 0.000000 |
| 1 | Male | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 10.69 | Yes | 1.0 | Cable | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | Month-to-month | 0.0 | Mailed check | 542.40 | 38.33 | 10.0 | 96.21 | 610.28 | 0.0 | 41-50 Years | 7-12 Months | $41-60 | 6-10 Gbs | 4.0 | 0.4 | 0.0 | 1.0 | 0.000000 | 0.070537 | 5.445455 | 0.018403 |
| 2 | Male | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 33.65 | No | 1.0 | Fiber Optic | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | Month-to-month | 1.0 | Electronic check | 280.85 | 0.00 | 0.0 | 134.60 | 415.45 | 1.0 | 41-50 Years | 4-6 Months | $61-80 | 26-30 Gbs | 4.0 | 0.4 | 0.0 | 1.0 | 0.000000 | 0.000000 | 2.383871 | 0.000000 |
| 3 | Male | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 27.82 | No | 1.0 | Fiber Optic | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | Month-to-month | 1.0 | Electronic check | 1237.85 | 0.00 | 0.0 | 361.66 | 1599.51 | 1.0 | 71-80 Years | 13-18 Months | $81-100 | 0-5 Gbs | 7.0 | 0.7 | 1.0 | 2.0 | 0.071429 | 0.000000 | 19.600000 | 0.000000 |
| 4 | Female | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 | 1.0 | 7.38 | No | 1.0 | Fiber Optic | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | Month-to-month | 1.0 | Mailed check | 267.40 | 0.00 | 0.0 | 22.14 | 289.54 | 1.0 | 71-80 Years | 0-3 Months | $81-100 | 11-15 Gbs | 5.0 | 0.5 | 1.0 | 2.0 | 0.750000 | 0.000000 | 6.991667 | 0.000000 |
Summary statistics:
| Gender | Under 30 | Senior Citizen | Partner | Dependents | Number of Dependents | Referred a Friend | Number of Referrals | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | Internet Type | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Unlimited Data | Contract | Paperless Billing | Payment Method | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Churn | Age_Bracket | Tenure_Bracket | Monthly_Charges_Bracket | AVG_Monthly_Gb_Bracket | Total_Number_of_Services | Service_Penetration_Rate | Has_Family | Family_Size | Referral_Rate | Refund_Rate | Monthly_Cost_Per_Gb | Extra_Charges_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032 | 7032.000000 | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032 | 7032.000000 | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032 | 7032 | 7032 | 7032 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 |
| unique | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | NaN | 7 | 9 | 7 | 12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | Male | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | No | NaN | Fiber Optic | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Month-to-month | NaN | Electronic check | NaN | NaN | NaN | NaN | NaN | NaN | 41-50 Years | 61-72 Months | $81-100 | 0-5 Gbs | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 3549 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3385 | NaN | 3035 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3875 | NaN | 2365 | NaN | NaN | NaN | NaN | NaN | NaN | 1311 | 1407 | 1763 | 2008 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 0.198663 | 0.162400 | 0.482509 | 0.298493 | 0.467577 | 0.456911 | 1.949232 | 0.903299 | 22.963471 | NaN | 0.783845 | NaN | 0.286547 | 0.344852 | 0.343857 | 0.290102 | 0.384386 | 0.388367 | 0.353669 | 0.674061 | NaN | 0.592719 | NaN | 2283.300441 | 1.965252 | 6.871445 | 749.957096 | 3038.163730 | 0.265785 | NaN | NaN | NaN | NaN | 4.752986 | 0.475299 | 0.533561 | 1.950085 | 0.092776 | 0.001741 | 8.559147 | 0.008423 |
| std | NaN | 0.399022 | 0.368844 | 0.499729 | 0.457629 | 0.962134 | 0.498175 | 3.001324 | 0.295571 | 15.449368 | NaN | 0.411650 | NaN | 0.452180 | 0.475354 | 0.475028 | 0.453842 | 0.486484 | 0.487414 | 0.478142 | 0.468758 | NaN | 0.491363 | NaN | 2266.771362 | 7.908412 | 25.123141 | 847.025001 | 2865.830234 | 0.441782 | NaN | NaN | NaN | NaN | 2.690764 | 0.269076 | 0.498908 | 1.219332 | 0.308979 | 0.010412 | 8.172502 | 0.068473 |
| min | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 18.800000 | 0.000000 | 0.000000 | 0.000000 | 21.360000 | 0.000000 | NaN | NaN | NaN | NaN | 1.000000 | 0.100000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.288953 | 0.000000 |
| 25% | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 9.210000 | NaN | 1.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 401.450000 | 0.000000 | 0.000000 | 70.567500 | 607.275000 | 0.000000 | NaN | NaN | NaN | NaN | 3.000000 | 0.300000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.378927 | 0.000000 |
| 50% | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 22.890000 | NaN | 1.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | NaN | 1.000000 | NaN | 1397.475000 | 0.000000 | 0.000000 | 403.875000 | 2111.300000 | 0.000000 | NaN | NaN | NaN | NaN | 5.000000 | 0.500000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 4.437747 | 0.000000 |
| 75% | NaN | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 3.000000 | 1.000000 | 36.412500 | NaN | 1.000000 | NaN | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 1.000000 | NaN | 3794.737500 | 0.000000 | 0.000000 | 1192.432500 | 4808.797500 | 1.000000 | NaN | NaN | NaN | NaN | 7.000000 | 0.700000 | 1.000000 | 2.000000 | 0.083333 | 0.000000 | 17.783750 | 0.000000 |
| max | NaN | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 9.000000 | 1.000000 | 11.000000 | 1.000000 | 49.990000 | NaN | 1.000000 | NaN | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 1.000000 | NaN | 8684.800000 | 49.790000 | 150.000000 | 3564.720000 | 11979.340000 | 1.000000 | NaN | NaN | NaN | NaN | 10.000000 | 1.000000 | 1.000000 | 11.000000 | 5.000000 | 0.189417 | 38.316667 | 2.723735 |
Number of missing values:
Gender 0 Under 30 0 Senior Citizen 0 Partner 0 Dependents 0 Number of Dependents 0 Referred a Friend 0 Number of Referrals 0 Phone Service 0 Avg Monthly Long Distance Charges 0 Multiple Lines 0 Internet Service 0 Internet Type 0 Online Security 0 Online Backup 0 Device Protection Plan 0 Premium Tech Support 0 Streaming TV 0 Streaming Movies 0 Streaming Music 0 Unlimited Data 0 Contract 0 Paperless Billing 0 Payment Method 0 Total Charges 0 Total Refunds 0 Total Extra Data Charges 0 Total Long Distance Charges 0 Total Revenue 0 Churn 0 Age_Bracket 0 Tenure_Bracket 0 Monthly_Charges_Bracket 0 AVG_Monthly_Gb_Bracket 0 Total_Number_of_Services 0 Service_Penetration_Rate 0 Has_Family 0 Family_Size 0 Referral_Rate 0 Refund_Rate 0 Monthly_Cost_Per_Gb 0 Extra_Charges_Ratio 0 dtype: int64
Number of unique values:
Gender 2 Under 30 2 Senior Citizen 2 Partner 2 Dependents 2 Number of Dependents 10 Referred a Friend 2 Number of Referrals 12 Phone Service 2 Avg Monthly Long Distance Charges 3582 Multiple Lines 3 Internet Service 2 Internet Type 4 Online Security 2 Online Backup 2 Device Protection Plan 2 Premium Tech Support 2 Streaming TV 2 Streaming Movies 2 Streaming Music 2 Unlimited Data 2 Contract 3 Paperless Billing 2 Payment Method 4 Total Charges 6530 Total Refunds 500 Total Extra Data Charges 16 Total Long Distance Charges 6059 Total Revenue 6964 Churn 2 Age_Bracket 7 Tenure_Bracket 9 Monthly_Charges_Bracket 7 AVG_Monthly_Gb_Bracket 12 Total_Number_of_Services 10 Service_Penetration_Rate 10 Has_Family 2 Family_Size 11 Referral_Rate 437 Refund_Rate 526 Monthly_Cost_Per_Gb 5120 Extra_Charges_Ratio 727 dtype: int64
In [60]:
### Distribution of Newly Engineered Numerical Features
plt.figure(figsize = (18, 13))
### Total Number of Services Distribution
plt.subplot(2, 3, 1)
sns.histplot(telco_churn_engineered_features_df['Total_Number_of_Services'], kde = True)
plt.axvline(telco_churn_engineered_features_df['Total_Number_of_Services'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_engineered_features_df['Total_Number_of_Services'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Number of Services Distribution')
### Family Size Distribution
plt.subplot(2, 3, 2)
sns.histplot(telco_churn_engineered_features_df['Family_Size'], kde = True)
plt.axvline(telco_churn_engineered_features_df['Family_Size'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_engineered_features_df['Family_Size'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Family Size Distribution')
### Referral Rate Distribution
plt.subplot(2, 3, 3)
sns.histplot(telco_churn_engineered_features_df['Referral_Rate'], kde = True)
plt.axvline(telco_churn_engineered_features_df['Referral_Rate'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_engineered_features_df['Referral_Rate'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Referral Rate Distribution')
### Refund Rate Distribution
plt.subplot(2, 3, 4)
sns.histplot(telco_churn_engineered_features_df['Refund_Rate'], kde = True)
plt.axvline(telco_churn_engineered_features_df['Refund_Rate'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_engineered_features_df['Refund_Rate'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Refund Rate Distribution')
### Monthly Cost per GB Distribution
plt.subplot(2, 3, 5)
sns.histplot(telco_churn_engineered_features_df['Monthly_Cost_Per_Gb'], kde = True)
plt.axvline(telco_churn_engineered_features_df['Monthly_Cost_Per_Gb'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_engineered_features_df['Monthly_Cost_Per_Gb'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Monthly Cost per GB Distribution')
### Extra Charges Ratio
plt.subplot(2, 3, 6)
sns.histplot(telco_churn_engineered_features_df['Extra_Charges_Ratio'], kde = True)
plt.axvline(telco_churn_engineered_features_df['Extra_Charges_Ratio'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(telco_churn_engineered_features_df['Extra_Charges_Ratio'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Extra Charges Ratio Distribution')
plt.tight_layout()
plt.show()
In [61]:
### Churn vs Newly Engineered Numerical Features
plt.figure(figsize=(18, 13))
### Churn vs Total Number of Services
plt.subplot(2, 3, 1)
sns.boxplot(x = 'Churn', y = 'Total_Number_of_Services', data = telco_churn_engineered_features_df)
plt.title('Churn vs Total Number of Services')
### Churn vs Family Size
plt.subplot(2, 3, 2)
sns.boxplot(x = 'Churn', y = 'Family_Size', data = telco_churn_engineered_features_df)
plt.title('Churn vs Family Size')
### Churn vs Referral Rate
plt.subplot(2, 3, 3)
sns.boxplot(x = 'Churn', y = 'Referral_Rate', data = telco_churn_engineered_features_df)
plt.title('Churn vs Referral Rate')
### Churn vs Refund Rate
plt.subplot(2, 3, 4)
sns.boxplot(x = 'Churn', y = 'Refund_Rate', data = telco_churn_engineered_features_df)
plt.title('Churn vs Refund Rate')
### Churn vs Monthly Cost per GB
plt.subplot(2, 3, 5)
sns.boxplot(x = 'Churn', y = 'Monthly_Cost_Per_Gb', data = telco_churn_engineered_features_df)
plt.title('Churn vs Monthly Cost per GB')
### Churn vs Extra Charges Ratio
plt.subplot(2, 3, 6)
sns.boxplot(x = 'Churn', y = 'Extra_Charges_Ratio', data = telco_churn_engineered_features_df)
plt.title('Churn vs Extra Charges Ratio')
plt.tight_layout()
plt.show()
In [64]:
### Churn vs Newly Engineered Categorical Features
plt.figure(figsize = (20, 15))
### Churn vs Age Bracket
plt.subplot(2, 3, 1)
sns.countplot(x = 'Age_Bracket', hue = 'Churn', data = telco_churn_engineered_features_df)
plt.title('Churn vs Age Bracket')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Tenure Bracket
plt.subplot(2, 3, 2)
sns.countplot(x = 'Tenure_Bracket', hue = 'Churn', data = telco_churn_engineered_features_df)
plt.title('Churn vs Tenure Bracket')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Monthly Charges Bracket
plt.subplot(2, 3, 3)
sns.countplot(x = 'Monthly_Charges_Bracket', hue = 'Churn', data = telco_churn_engineered_features_df)
plt.title('Churn vs Monthly Charges Bracket')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Average Monthly GB Download
plt.subplot(2, 3, 4)
sns.countplot(x = 'AVG_Monthly_Gb_Bracket', hue = 'Churn', data = telco_churn_engineered_features_df)
plt.title('Churn vs Average Monthly GB Download')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Has Family Indicator
plt.subplot(2, 3, 5)
sns.countplot(x = 'Has_Family', hue = 'Churn', data = telco_churn_engineered_features_df)
plt.title('Churn vs Has Family')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [68]:
### Rerun optimized models on telco df with engineered features
### Defining the input feature matrix (X) and the target variable vector (y).
telco_churn_engineered_features_X = telco_churn_engineered_features_df_encoded_excluding_churn.copy()
telco_churn_engineered_features_y = telco_churn_engineered_features_df_encoded['Churn']
### Creating a list of the continuous features that we will be scaling, we do not want to scale the binary indicator columns only the continuous numerical columns.
telco_churn_engineered_features_continuous_columns = [
'Number of Dependents',
'Number of Referrals',
'Avg Monthly Long Distance Charges',
'Total Charges',
'Total Refunds',
'Total Extra Data Charges',
'Total Long Distance Charges',
'Total Revenue',
'Total_Number_of_Services',
'Service_Penetration_Rate',
'Family_Size',
'Referral_Rate',
'Refund_Rate',
'Monthly_Cost_Per_Gb',
'Extra_Charges_Ratio'
]
#############################################################################################################################################################################################
### Defining Sampling/Scaling techniques and the classification models we are going to be comparing.
telco_churn_engineered_features_classification_models = {
'logistic_regression': LogisticRegression(random_state = 22),
'support_vector_classifier': SVC(probability = True, random_state = 22),
'decision_tree_classifier': DecisionTreeClassifier(random_state = 22),
'random_forest_classifier': RandomForestClassifier(random_state = 22),
'gaussian_naive_bayes_classifier': GaussianNB(),
'xgboost_classifier': XGBClassifier(random_state = 22)
}
telco_churn_engineered_features_scaling_techniques = {
'no_scaling': None,
'standard_scaler': StandardScaler(),
'robust_scaler': RobustScaler(),
'min_max_scaler': MinMaxScaler()
}
telco_churn_engineered_features_sampling_techniques = {
'no_sampling': None,
'random_undersampling': RandomUnderSampler(random_state = 22),
'random_oversampling': RandomOverSampler(random_state = 22),
'smote': SMOTE(random_state = 22)
}
#############################################################################################################################################################################################
### Optimized model configurations from gridsearch_results_list
telco_churn_engineered_features_optimized_models = {
entry['classification_model']: {
'scaling_technique': entry['scaling_technique'],
'sampling_technique': entry['sampling_technique'],
'best_hyperparameters': entry['best_hyperparameters']
}
for entry in gridsearch_results_list
}
#############################################################################################################################################################################################
### Setting up KFold crossvalidation with 5 splits.
telco_churn_engineered_features_cross_validation_split = KFold(n_splits = 5, shuffle = True, random_state = 22)
### Creating containers to store average model metrics information and ROC curve information (false positive rate and true positive rate arrays).
telco_churn_engineered_features_average_model_metrics = []
telco_churn_engineered_features_roc_data = {}
### Looping through each combination of model, scaling technique, and sampling technique.
for telco_churn_engineered_features_classification_model_name, telco_churn_engineered_features_classification_model_config in telco_churn_engineered_features_optimized_models.items():
### Getting classification model and set optimized hyperparameters
telco_churn_engineered_features_classification_model = telco_churn_engineered_features_classification_models[telco_churn_engineered_features_classification_model_name]
telco_churn_engineered_features_classification_model.set_params(**telco_churn_engineered_features_classification_model_config['best_hyperparameters'])
### Get scaling and sampling techniques
telco_churn_engineered_features_scaling_technique = telco_churn_engineered_features_scaling_techniques[telco_churn_engineered_features_classification_model_config['scaling_technique']]
telco_churn_engineered_features_sampling_technique = telco_churn_engineered_features_sampling_techniques[telco_churn_engineered_features_classification_model_config['sampling_technique']]
### Creating containers to store model metric information for each cross validation fold.
telco_churn_engineered_features_accuracy_scores_per_fold = []
telco_churn_engineered_features_precision_scores_per_fold = []
telco_churn_engineered_features_recall_scores_per_fold = []
telco_churn_engineered_features_f1_scores_per_fold = []
telco_churn_engineered_features_log_loss_values_per_fold = []
telco_churn_engineered_features_roc_auc_scores_per_fold = []
### Creating containers to store true labels and predicted probabilities across all folds for ROC curve calculations.
telco_churn_engineered_features_all_true_labels = []
telco_churn_engineered_features_all_probabilities = []
### Setting up cross validation dataset splits
for telco_churn_engineered_features_train_indexes, telco_churn_engineered_features_test_indexes in telco_churn_engineered_features_cross_validation_split.split(telco_churn_engineered_features_X, telco_churn_engineered_features_y):
telco_churn_engineered_features_X_train = telco_churn_engineered_features_X.iloc[telco_churn_engineered_features_train_indexes].copy()
telco_churn_engineered_features_X_test = telco_churn_engineered_features_X.iloc[telco_churn_engineered_features_test_indexes].copy()
telco_churn_engineered_features_y_train = telco_churn_engineered_features_y.iloc[telco_churn_engineered_features_train_indexes].copy()
telco_churn_engineered_features_y_test = telco_churn_engineered_features_y.iloc[telco_churn_engineered_features_test_indexes].copy()
### Applying the scaling techniques to only the continuous features defined above.
if telco_churn_engineered_features_scaling_technique is not None:
telco_churn_engineered_features_X_train.loc[:, telco_churn_engineered_features_continuous_columns] = telco_churn_engineered_features_scaling_technique.fit_transform(telco_churn_engineered_features_X_train[telco_churn_engineered_features_continuous_columns])
telco_churn_engineered_features_X_test.loc[:, telco_churn_engineered_features_continuous_columns] = telco_churn_engineered_features_scaling_technique.transform(telco_churn_engineered_features_X_test[telco_churn_engineered_features_continuous_columns])
### Applying the sampling techniques.
if telco_churn_engineered_features_sampling_technique is not None:
telco_churn_engineered_features_X_train, telco_churn_engineered_features_y_train = telco_churn_engineered_features_sampling_technique.fit_resample(telco_churn_engineered_features_X_train, telco_churn_engineered_features_y_train)
### Training the classification model.
telco_churn_engineered_features_classification_model.fit(telco_churn_engineered_features_X_train, telco_churn_engineered_features_y_train)
### Generating predictions and probabilities of churn on the testing dataset.
telco_churn_engineered_features_y_predicted_labels = telco_churn_engineered_features_classification_model.predict(telco_churn_engineered_features_X_test)
telco_churn_engineered_features_y_predicted_probabilities = telco_churn_engineered_features_classification_model.predict_proba(telco_churn_engineered_features_X_test)[:, 1]
### Calculating performance metrics for each cross validation fold.
telco_churn_engineered_features_accuracy_scores_per_fold.append(accuracy_score(telco_churn_engineered_features_y_test, telco_churn_engineered_features_y_predicted_labels))
telco_churn_engineered_features_precision_scores_per_fold.append(precision_score(telco_churn_engineered_features_y_test, telco_churn_engineered_features_y_predicted_labels))
telco_churn_engineered_features_recall_scores_per_fold.append(recall_score(telco_churn_engineered_features_y_test, telco_churn_engineered_features_y_predicted_labels))
telco_churn_engineered_features_f1_scores_per_fold.append(f1_score(telco_churn_engineered_features_y_test, telco_churn_engineered_features_y_predicted_labels))
telco_churn_engineered_features_log_loss_values_per_fold.append(log_loss(telco_churn_engineered_features_y_test, telco_churn_engineered_features_y_predicted_probabilities))
telco_churn_engineered_features_roc_auc_scores_per_fold.append(roc_auc_score(telco_churn_engineered_features_y_test, telco_churn_engineered_features_y_predicted_probabilities))
### Collecting data for ROC curve
telco_churn_engineered_features_all_true_labels.extend(telco_churn_engineered_features_y_test.tolist())
telco_churn_engineered_features_all_probabilities.extend(telco_churn_engineered_features_y_predicted_probabilities.tolist())
### Computing a single ROC curve for each combination across all folds.
telco_churn_engineered_features_false_positive_rates, telco_churn_engineered_features_true_positive_rates, _ = roc_curve(telco_churn_engineered_features_all_true_labels, telco_churn_engineered_features_all_probabilities)
### Storing averaged metrics across all folds for each combination.
telco_churn_engineered_features_average_model_metrics.append({
'classification_model': telco_churn_engineered_features_classification_model_name,
'scaling_technique': telco_churn_engineered_features_classification_model_config['scaling_technique'],
'sampling_technique': telco_churn_engineered_features_classification_model_config['sampling_technique'],
'accuracy': np.mean(telco_churn_engineered_features_accuracy_scores_per_fold),
'precision': np.mean(telco_churn_engineered_features_precision_scores_per_fold),
'recall': np.mean(telco_churn_engineered_features_recall_scores_per_fold),
'f1_score': np.mean(telco_churn_engineered_features_f1_scores_per_fold),
'log_loss': np.mean(telco_churn_engineered_features_log_loss_values_per_fold),
'roc_auc': np.mean(telco_churn_engineered_features_roc_auc_scores_per_fold)
})
### Storing ROC curve information for plotting
telco_churn_engineered_features_roc_data[(telco_churn_engineered_features_classification_model_name, telco_churn_engineered_features_classification_model_config['scaling_technique'], telco_churn_engineered_features_classification_model_config['sampling_technique'])] = (telco_churn_engineered_features_false_positive_rates, telco_churn_engineered_features_true_positive_rates)
### Converting to DataFrame
telco_churn_engineered_features_results_df = pd.DataFrame(telco_churn_engineered_features_average_model_metrics)
print("Performance on a Telco DF With Engineered Features:")
display(telco_churn_engineered_features_results_df.sort_values(by = 'f1_score', ascending = False))
Performance on a Telco DF With Engineered Features:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | random_forest_classifier | standard_scaler | random_undersampling | 0.777302 | 0.552262 | 0.860853 | 0.672624 | 0.434098 | 0.884096 |
| 3 | logistic_regression | standard_scaler | random_oversampling | 0.777588 | 0.555098 | 0.821599 | 0.662332 | 0.433754 | 0.881479 |
| 0 | xgboost_classifier | no_scaling | random_oversampling | 0.817972 | 0.655546 | 0.667395 | 0.661047 | 0.453056 | 0.882279 |
| 5 | decision_tree_classifier | standard_scaler | random_undersampling | 0.747298 | 0.516246 | 0.846182 | 0.640395 | 0.610188 | 0.839351 |
| 4 | gaussian_naive_bayes_classifier | standard_scaler | smote | 0.778297 | 0.571152 | 0.687393 | 0.622214 | 2.851317 | 0.838566 |
| 2 | support_vector_classifier | standard_scaler | random_oversampling | 0.733077 | 0.459865 | 0.019299 | 0.036977 | 0.753589 | 0.764371 |
In [70]:
### Plotting the performance metrics of the optimized models on telco df with engineered features.
plt.figure(figsize = (20, 15))
### Plotting Accuracy Performance
plt.subplot(2, 3, 1)
sns.barplot(x = 'classification_model', y = 'accuracy', data = telco_churn_engineered_features_results_df.sort_values(by = 'accuracy', ascending = False))
plt.title('Model Accuracy')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Precision Performance
plt.subplot(2, 3, 2)
sns.barplot(x = 'classification_model', y = 'precision', data = telco_churn_engineered_features_results_df.sort_values(by = 'precision', ascending = False))
plt.title('Model Precision')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Recall Performance
plt.subplot(2, 3, 3)
sns.barplot(x = 'classification_model', y = 'recall', data = telco_churn_engineered_features_results_df.sort_values(by = 'recall', ascending = False))
plt.title('Model Recall')
plt.xticks(rotation = 45, ha = 'right')
### Plotting F1 Score Performance
plt.subplot(2, 3, 4)
sns.barplot(x = 'classification_model', y = 'f1_score', data = telco_churn_engineered_features_results_df.sort_values(by = 'f1_score', ascending = False))
plt.title('Model F1 Score')
plt.xticks(rotation = 45, ha = 'right')
### Plotting ROC AUC Performance
plt.subplot(2, 3, 5)
sns.barplot(x = 'classification_model', y = 'roc_auc', data = telco_churn_engineered_features_results_df.sort_values(by = 'roc_auc', ascending = False))
plt.title('Model ROC AUC')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [72]:
### Looping through each optimized model and creating a ROC curve using the different telco dataframe.
for index, row in telco_churn_engineered_features_results_df.iterrows():
telco_churn_engineered_features_model_name = row['classification_model']
telco_churn_engineered_features_scaler_name = row['scaling_technique']
telco_churn_engineered_features_sampler_name = row['sampling_technique']
telco_churn_engineered_features_combination_key = (telco_churn_engineered_features_model_name, telco_churn_engineered_features_scaler_name, telco_churn_engineered_features_sampler_name)
if telco_churn_engineered_features_combination_key in telco_churn_engineered_features_roc_data:
telco_churn_engineered_features_false_positive_rate, telco_churn_engineered_features_true_positive_rate = telco_churn_engineered_features_roc_data[telco_churn_engineered_features_combination_key]
telco_churn_engineered_features_combination_label = f"{telco_churn_engineered_features_model_name} | {telco_churn_engineered_features_scaler_name} | {telco_churn_engineered_features_sampler_name}"
plt.figure(figsize = (15, 10))
sns.lineplot(x = telco_churn_engineered_features_false_positive_rate, y = telco_churn_engineered_features_true_positive_rate, label = telco_churn_engineered_features_combination_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title(f"ROC Curve of Optimized {telco_churn_engineered_features_model_name} Model On Telco DF With Engineered Features")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc = "lower right")
plt.tight_layout()
plt.show()
In [78]:
### SHAP Feature Importances
### Extract the optimal XGBoost configuration from gridsearch_results_list
telco_churn_engineered_features_xgb_config = next(cfg for cfg in gridsearch_results_list if cfg['classification_model'] == 'xgboost_classifier')
### Initializing the XGBoost model with the optimized hyperparameters.
telco_churn_engineered_features_xgb_model = XGBClassifier(random_state = 22)
telco_churn_engineered_features_xgb_model.set_params(**telco_churn_engineered_features_xgb_config['best_hyperparameters'])
### Defining the input feature matrix (X) and the target variable vector (y).
telco_churn_engineered_features_X_SHAP = telco_churn_engineered_features_X.copy()
telco_churn_engineered_features_y_SHAP = telco_churn_engineered_features_y.copy()
### Applying the optimal scaling techniques.
if telco_churn_engineered_features_scaling_technique is not None:
telco_churn_engineered_features_X_SHAP[telco_churn_engineered_features_continuous_columns] = telco_churn_engineered_features_scaling_technique.fit_transform(telco_churn_engineered_features_X_SHAP[telco_churn_engineered_features_continuous_columns])
### Applying the optimal sampling techniques.
if telco_churn_engineered_features_sampling_technique is not None:
telco_churn_engineered_features_X_SHAP, telco_churn_engineered_features_y_SHAP = telco_churn_engineered_features_sampling_technique.fit_resample(telco_churn_engineered_features_X_SHAP, telco_churn_engineered_features_y_SHAP)
### Fitting the model.
telco_churn_engineered_features_xgb_model.fit(telco_churn_engineered_features_X_SHAP, telco_churn_engineered_features_y_SHAP)
### SHAP analysis
telco_churn_engineered_features_explainer = shap.TreeExplainer(telco_churn_engineered_features_xgb_model)
telco_churn_engineered_features_shap_values = telco_churn_engineered_features_explainer.shap_values(telco_churn_engineered_features_X_SHAP)
### Creating a feature importance dataframe.
telco_churn_engineered_features_shap_df = pd.DataFrame({
'Feature': telco_churn_engineered_features_X_SHAP.columns,
'Mean_SHAP_value': telco_churn_engineered_features_shap_values.mean(axis = 0),
'Mean_ABS_SHAP_value': np.abs(telco_churn_engineered_features_shap_values).mean(axis = 0)
}).sort_values('Mean_ABS_SHAP_value', ascending = False)
### Plotting the top 10 Mean SHAP Feature Contributions sorted by mean absolute shap values
plt.figure(figsize = (10, 6))
sns.barplot(data = telco_churn_engineered_features_shap_df.head(10), x = 'Mean_SHAP_value', y = 'Feature')
plt.title('Top 10 Mean SHAP Feature Contributions (via XGBoost)')
plt.xlabel('Mean SHAP Value (Direction + Magnitude)')
plt.ylabel('Features')
plt.axvline(0, color = 'black', linestyle = '--')
plt.tight_layout()
plt.show()
print('\n')
### Displaying the feature importance dataframe.
print('telco_churn_engineered_features_shap_df:')
display(telco_churn_engineered_features_shap_df)
print('\n')
telco_churn_engineered_features_shap_df:
| Feature | Mean_SHAP_value | Mean_ABS_SHAP_value | |
|---|---|---|---|
| 6 | Number of Referrals | -0.377705 | 1.043854 |
| 41 | Contract_Month-to-month | -0.049816 | 0.892400 |
| 4 | Number of Dependents | -0.255171 | 0.552767 |
| 23 | Total Revenue | -0.052648 | 0.494655 |
| 22 | Total Long Distance Charges | 0.013714 | 0.463415 |
| 19 | Total Charges | 0.010784 | 0.402442 |
| 28 | Referral_Rate | 0.103467 | 0.389254 |
| 8 | Avg Monthly Long Distance Charges | -0.021327 | 0.384105 |
| 30 | Monthly_Cost_Per_Gb | 0.067361 | 0.374549 |
| 3 | Dependents | 0.235145 | 0.351759 |
| 46 | Payment Method_Electronic check | -0.004498 | 0.290373 |
| 14 | Streaming TV | 0.001937 | 0.281954 |
| 18 | Paperless Billing | 0.003067 | 0.275497 |
| 39 | Internet Type_Fiber Optic | -0.015395 | 0.247086 |
| 13 | Premium Tech Support | 0.001971 | 0.210998 |
| 10 | Online Security | 0.009443 | 0.187707 |
| 24 | Total_Number_of_Services | 0.012996 | 0.178623 |
| 43 | Contract_Two year | -0.054211 | 0.173447 |
| 27 | Family_Size | -0.066315 | 0.155510 |
| 0 | Under 30 | 0.004008 | 0.145398 |
| 16 | Streaming Music | -0.002594 | 0.145162 |
| 36 | Multiple Lines_Yes | -0.004491 | 0.133333 |
| 47 | Payment Method_Mailed check | 0.013803 | 0.130934 |
| 72 | Monthly_Charges_Bracket_$101-120 | -0.020771 | 0.129620 |
| 38 | Internet Type_DSL | -0.016526 | 0.124929 |
| 32 | Gender_Female | 0.006253 | 0.122523 |
| 34 | Multiple Lines_No | 0.004372 | 0.115301 |
| 51 | Age_Bracket_31-40 Years | 0.008253 | 0.104612 |
| 5 | Referred a Friend | 0.020740 | 0.086187 |
| 55 | Age_Bracket_71-80 Years | 0.005685 | 0.085441 |
| 52 | Age_Bracket_41-50 Years | 0.000371 | 0.083246 |
| 53 | Age_Bracket_51-60 Years | 0.003793 | 0.078882 |
| 64 | Tenure_Bracket_61-72 Months | -0.007963 | 0.075677 |
| 15 | Streaming Movies | -0.010532 | 0.074948 |
| 11 | Online Backup | -0.007058 | 0.072363 |
| 25 | Service_Penetration_Rate | 0.004979 | 0.071256 |
| 26 | Has_Family | 0.009771 | 0.065870 |
| 58 | Tenure_Bracket_7-12 Months | -0.000361 | 0.061395 |
| 78 | AVG_Monthly_Gb_Bracket_26-30 Gbs | 0.007308 | 0.061051 |
| 31 | Extra_Charges_Ratio | 0.012582 | 0.060037 |
| 61 | Tenure_Bracket_25-36 Months | 0.004407 | 0.059527 |
| 2 | Partner | 0.023518 | 0.059469 |
| 45 | Payment Method_Credit card (automatic) | -0.001407 | 0.059443 |
| 70 | Monthly_Charges_Bracket_$61-80 | 0.003179 | 0.059420 |
| 12 | Device Protection Plan | 0.002173 | 0.057822 |
| 57 | Tenure_Bracket_4-6 Months | 0.002894 | 0.057816 |
| 37 | Internet Type_Cable | 0.007662 | 0.057108 |
| 77 | AVG_Monthly_Gb_Bracket_21-25 Gbs | 0.002853 | 0.053670 |
| 69 | Monthly_Charges_Bracket_$41-60 | -0.004452 | 0.053620 |
| 71 | Monthly_Charges_Bracket_$81-100 | 0.008879 | 0.053452 |
| 76 | AVG_Monthly_Gb_Bracket_16-20 Gbs | 0.005662 | 0.051963 |
| 1 | Senior Citizen | 0.007383 | 0.048752 |
| 74 | AVG_Monthly_Gb_Bracket_6-10 Gbs | 0.000452 | 0.048520 |
| 75 | AVG_Monthly_Gb_Bracket_11-15 Gbs | 0.000055 | 0.047875 |
| 62 | Tenure_Bracket_37-48 Months | -0.004595 | 0.047206 |
| 9 | Internet Service | 0.000553 | 0.047193 |
| 50 | Age_Bracket_21-30 Years | 0.003050 | 0.042104 |
| 44 | Payment Method_Bank transfer (automatic) | 0.005332 | 0.041601 |
| 67 | Monthly_Charges_Bracket_$21-30 | -0.000584 | 0.040808 |
| 59 | Tenure_Bracket_13-18 Months | 0.006856 | 0.039669 |
| 42 | Contract_One year | -0.022006 | 0.038567 |
| 73 | AVG_Monthly_Gb_Bracket_0-5 Gbs | 0.012378 | 0.036804 |
| 54 | Age_Bracket_61-70 Years | 0.000095 | 0.033667 |
| 21 | Total Extra Data Charges | -0.003531 | 0.032604 |
| 20 | Total Refunds | -0.003600 | 0.029020 |
| 17 | Unlimited Data | -0.004162 | 0.027893 |
| 66 | Monthly_Charges_Bracket_$11-20 | 0.002489 | 0.027541 |
| 56 | Tenure_Bracket_0-3 Months | -0.007834 | 0.025784 |
| 33 | Gender_Male | 0.003348 | 0.024810 |
| 29 | Refund_Rate | 0.002309 | 0.024133 |
| 81 | AVG_Monthly_Gb_Bracket_51-60 Gbs | 0.001319 | 0.023855 |
| 60 | Tenure_Bracket_19-24 Months | 0.000909 | 0.022555 |
| 63 | Tenure_Bracket_49-60 Months | -0.002092 | 0.020609 |
| 80 | AVG_Monthly_Gb_Bracket_41-50 Gbs | 0.000388 | 0.017624 |
| 68 | Monthly_Charges_Bracket_$31-40 | -0.001083 | 0.013643 |
| 49 | Age_Bracket_11-20 Years | 0.002020 | 0.012581 |
| 83 | AVG_Monthly_Gb_Bracket_71-80 Gbs | 0.000724 | 0.008386 |
| 82 | AVG_Monthly_Gb_Bracket_61-70 Gbs | 0.000521 | 0.005047 |
| 7 | Phone Service | 0.002423 | 0.004384 |
| 84 | AVG_Monthly_Gb_Bracket_81-90 Gbs | 0.000235 | 0.003170 |
| 35 | Multiple Lines_No phone service | 0.000000 | 0.000000 |
| 40 | Internet Type_No Internet | 0.000000 | 0.000000 |
| 48 | Age_Bracket_0-10 Years | 0.000000 | 0.000000 |
| 65 | Monthly_Charges_Bracket_$0-10 | 0.000000 | 0.000000 |
| 79 | AVG_Monthly_Gb_Bracket_31-40 Gbs | 0.000000 | 0.000000 |
In [ ]:
###############################################################################################################################################
###############################################################################################################################################
###############################################################################################################################################
###############################################################################################################################################
In [80]:
### Loading another telco customer churn dataset to compare model performance on different data from a similar industry.
diff_telco_url = "https://raw.githubusercontent.com/adamcookeunc/DATA_780_PROJECT/refs/heads/main/customer_churn_dataset-testing-master.csv"
diff_telco_churn_df = pd.read_csv(diff_telco_url)
diff_telco_churn_df.head()
Out[80]:
| CustomerID | Age | Gender | Tenure | Usage Frequency | Support Calls | Payment Delay | Subscription Type | Contract Length | Total Spend | Last Interaction | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22 | Female | 25 | 14 | 4 | 27 | Basic | Monthly | 598 | 9 | 1 |
| 1 | 2 | 41 | Female | 28 | 28 | 7 | 13 | Standard | Monthly | 584 | 20 | 0 |
| 2 | 3 | 47 | Male | 27 | 10 | 2 | 29 | Premium | Annual | 757 | 21 | 0 |
| 3 | 4 | 35 | Male | 9 | 12 | 5 | 17 | Premium | Quarterly | 232 | 18 | 0 |
| 4 | 5 | 53 | Female | 58 | 24 | 9 | 2 | Standard | Annual | 533 | 18 | 0 |
In [82]:
### Dropping the CustomerID column because it is irrelavent.
diff_telco_churn_df = diff_telco_churn_df.drop(columns = ['CustomerID'])
### Ensuring that all numerical columns are in the float 64 formatting.
diff_telco_churn_df['Age'] = diff_telco_churn_df['Age'].astype(float)
diff_telco_churn_df['Tenure'] = diff_telco_churn_df['Tenure'].astype(float)
diff_telco_churn_df['Usage Frequency'] = diff_telco_churn_df['Usage Frequency'].astype(float)
diff_telco_churn_df['Support Calls'] = diff_telco_churn_df['Support Calls'].astype(float)
diff_telco_churn_df['Payment Delay'] = diff_telco_churn_df['Payment Delay'].astype(float)
diff_telco_churn_df['Total Spend'] = diff_telco_churn_df['Total Spend'].astype(float)
diff_telco_churn_df['Last Interaction'] = diff_telco_churn_df['Last Interaction'].astype(float)
diff_telco_churn_df['Churn'] = diff_telco_churn_df['Churn'].astype(float)
### One-Hot-Encoding categorical/nominal columns
diff_telco_churn_df_encoded = pd.get_dummies(diff_telco_churn_df, columns = ['Gender', 'Subscription Type', 'Contract Length'], dtype = float)
### Excluding the churn column (the column that we are predicting)
diff_telco_churn_df_encoded_excluding_churn = diff_telco_churn_df_encoded.drop(columns = ['Churn'])
In [84]:
### Summary statistics
print("Summary statistics:")
display(diff_telco_churn_df.describe(include = 'all'))
print('\n')
### Data types
print("Data types:")
display(diff_telco_churn_df.dtypes)
print('\n')
### Number of missing values
print("Number of missing values:")
display(diff_telco_churn_df.isna().sum())
print('\n')
### Number of unique values
print("Number of unique values:")
display(diff_telco_churn_df.nunique())
print('\n')
Summary statistics:
| Age | Gender | Tenure | Usage Frequency | Support Calls | Payment Delay | Subscription Type | Contract Length | Total Spend | Last Interaction | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 64374.000000 | 64374 | 64374.000000 | 64374.000000 | 64374.000000 | 64374.000000 | 64374 | 64374 | 64374.000000 | 64374.000000 | 64374.000000 |
| unique | NaN | 2 | NaN | NaN | NaN | NaN | 3 | 3 | NaN | NaN | NaN |
| top | NaN | Female | NaN | NaN | NaN | NaN | Standard | Monthly | NaN | NaN | NaN |
| freq | NaN | 34353 | NaN | NaN | NaN | NaN | 21502 | 22130 | NaN | NaN | NaN |
| mean | 41.970982 | NaN | 31.994827 | 15.080234 | 5.400690 | 17.133952 | NaN | NaN | 541.023379 | 15.498850 | 0.473685 |
| std | 13.924911 | NaN | 17.098234 | 8.816470 | 3.114005 | 8.852211 | NaN | NaN | 260.874809 | 8.638436 | 0.499311 |
| min | 18.000000 | NaN | 1.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | NaN | 100.000000 | 1.000000 | 0.000000 |
| 25% | 30.000000 | NaN | 18.000000 | 7.000000 | 3.000000 | 10.000000 | NaN | NaN | 313.000000 | 8.000000 | 0.000000 |
| 50% | 42.000000 | NaN | 33.000000 | 15.000000 | 6.000000 | 19.000000 | NaN | NaN | 534.000000 | 15.000000 | 0.000000 |
| 75% | 54.000000 | NaN | 47.000000 | 23.000000 | 8.000000 | 25.000000 | NaN | NaN | 768.000000 | 23.000000 | 1.000000 |
| max | 65.000000 | NaN | 60.000000 | 30.000000 | 10.000000 | 30.000000 | NaN | NaN | 1000.000000 | 30.000000 | 1.000000 |
Data types:
Age float64 Gender object Tenure float64 Usage Frequency float64 Support Calls float64 Payment Delay float64 Subscription Type object Contract Length object Total Spend float64 Last Interaction float64 Churn float64 dtype: object
Number of missing values:
Age 0 Gender 0 Tenure 0 Usage Frequency 0 Support Calls 0 Payment Delay 0 Subscription Type 0 Contract Length 0 Total Spend 0 Last Interaction 0 Churn 0 dtype: int64
Number of unique values:
Age 48 Gender 2 Tenure 60 Usage Frequency 30 Support Calls 11 Payment Delay 31 Subscription Type 3 Contract Length 3 Total Spend 901 Last Interaction 30 Churn 2 dtype: int64
In [86]:
### Distribution of Churn (The target variable) on a different telco df
plt.figure(figsize = (8, 6))
sns.countplot(x = 'Churn', data = diff_telco_churn_df)
plt.title('Churn Distribution of a Different Telco DF')
plt.show()
print('\n')
print("Churn Distribution:")
print(diff_telco_churn_df['Churn'].value_counts())
print('\n')
print("Churn Distribution Percentages:")
print(diff_telco_churn_df['Churn'].value_counts(normalize = True) * 100)
print('\n')
Churn Distribution: Churn 0.0 33881 1.0 30493 Name: count, dtype: int64 Churn Distribution Percentages: Churn 0.0 52.631497 1.0 47.368503 Name: proportion, dtype: float64
In [88]:
### Distribution of Different Telco DF Numerical Features
plt.figure(figsize = (18, 13))
### Age Distribution
plt.subplot(3, 3, 1)
sns.histplot(diff_telco_churn_df['Age'], kde = True)
plt.axvline(diff_telco_churn_df['Age'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Age'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Age Distribution')
### Tenure Distribution
plt.subplot(3, 3, 2)
sns.histplot(diff_telco_churn_df['Tenure'], kde = True)
plt.axvline(diff_telco_churn_df['Tenure'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Tenure'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Tenure Distribution')
### Usage Frequency Distribution
plt.subplot(3, 3, 3)
sns.histplot(diff_telco_churn_df['Usage Frequency'], kde = True)
plt.axvline(diff_telco_churn_df['Usage Frequency'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Usage Frequency'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Usage Frequency Distribution')
### Support Calls Distribution
plt.subplot(3, 3, 4)
sns.histplot(diff_telco_churn_df['Support Calls'], kde = True)
plt.axvline(diff_telco_churn_df['Support Calls'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Support Calls'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Support Calls Distribution')
### Payment Delay Distribution
plt.subplot(3, 3, 5)
sns.histplot(diff_telco_churn_df['Payment Delay'], kde = True)
plt.axvline(diff_telco_churn_df['Payment Delay'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Payment Delay'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Payment Delay Distribution')
### Total Spend Ratio
plt.subplot(3, 3, 6)
sns.histplot(diff_telco_churn_df['Total Spend'], kde = True)
plt.axvline(diff_telco_churn_df['Total Spend'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Total Spend'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Total Spend Distribution')
### Last Interaction Ratio
plt.subplot(3, 3, 7)
sns.histplot(diff_telco_churn_df['Last Interaction'], kde = True)
plt.axvline(diff_telco_churn_df['Last Interaction'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(diff_telco_churn_df['Last Interaction'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Last Interaction Distribution')
plt.tight_layout()
plt.show()
In [92]:
### Churn vs Different Telco DF Numerical Features
plt.figure(figsize = (18, 13))
### Churn vs Age
plt.subplot(3, 3, 1)
sns.boxplot(x = 'Churn', y = 'Age', data = diff_telco_churn_df)
plt.title('Churn vs Age')
### Churn vs Tenure
plt.subplot(3, 3, 2)
sns.boxplot(x = 'Churn', y = 'Tenure', data = diff_telco_churn_df)
plt.title('Churn vs Tenure')
### Churn vs Usage Frequency
plt.subplot(3, 3, 3)
sns.boxplot(x = 'Churn', y = 'Usage Frequency', data = diff_telco_churn_df)
plt.title('Churn vs Usage Frequency')
### Churn vs Support Calls
plt.subplot(3, 3, 4)
sns.boxplot(x = 'Churn', y = 'Support Calls', data = diff_telco_churn_df)
plt.title('Churn vs Support Calls')
### Churn vs Payment Delay
plt.subplot(3, 3, 5)
sns.boxplot(x = 'Churn', y = 'Payment Delay', data = diff_telco_churn_df)
plt.title('Churn vs Payment Delay')
### Churn vs Total Spend
plt.subplot(3, 3, 6)
sns.boxplot(x = 'Churn', y = 'Total Spend', data = diff_telco_churn_df)
plt.title('Churn vs Total Spend')
### Churn vs Last Interaction
plt.subplot(3, 3, 7)
sns.boxplot(x = 'Churn', y = 'Last Interaction', data = diff_telco_churn_df)
plt.title('Churn vs Last Interaction')
plt.tight_layout()
plt.show()
In [94]:
### Churn vs Different Telco DF Categorical Features
plt.figure(figsize = (20, 10))
### Churn vs Gender
plt.subplot(1, 3, 1)
sns.countplot(x = 'Gender', hue = 'Churn', data = diff_telco_churn_df)
plt.title('Churn vs Gender')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Subscription Type
plt.subplot(1, 3, 2)
sns.countplot(x = 'Subscription Type', hue = 'Churn', data = diff_telco_churn_df)
plt.title('Churn vs Subscription Type')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Contract Length
plt.subplot(1, 3, 3)
sns.countplot(x = 'Contract Length', hue = 'Churn', data = diff_telco_churn_df)
plt.title('Churn vs Contract Length')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [96]:
### Rerun optimized models on diff telco df
### Defining the input feature matrix (X) and the target variable vector (y).
diff_telco_X = diff_telco_churn_df_encoded_excluding_churn.copy()
diff_telco_y = diff_telco_churn_df_encoded['Churn']
### Creating a list of the continuous features that we will be scaling, we do not want to scale the binary indicator columns only the continuous numerical columns.
diff_telco_continuous_columns = [
'Age',
'Tenure',
'Usage Frequency',
'Support Calls',
'Payment Delay',
'Total Spend',
'Last Interaction',
]
#############################################################################################################################################################################################
### Defining Sampling/Scaling techniques and the classification models we are going to be comparing.
diff_telco_classification_models = {
'logistic_regression': LogisticRegression(random_state = 22),
'support_vector_classifier': SVC(probability = True, random_state = 22),
'decision_tree_classifier': DecisionTreeClassifier(random_state = 22),
'random_forest_classifier': RandomForestClassifier(random_state = 22),
'gaussian_naive_bayes_classifier': GaussianNB(),
'xgboost_classifier': XGBClassifier(random_state = 22)
}
diff_telco_scaling_techniques = {
'no_scaling': None,
'standard_scaler': StandardScaler(),
'robust_scaler': RobustScaler(),
'min_max_scaler': MinMaxScaler()
}
diff_telco_sampling_techniques = {
'no_sampling': None,
'random_undersampling': RandomUnderSampler(random_state = 22),
'random_oversampling': RandomOverSampler(random_state = 22),
'smote': SMOTE(random_state = 22)
}
#############################################################################################################################################################################################
### Optimized model configurations from gridsearch_results_list
diff_telco_optimized_models = {
entry['classification_model']: {
'scaling_technique': entry['scaling_technique'],
'sampling_technique': entry['sampling_technique'],
'best_hyperparameters': entry['best_hyperparameters']
}
for entry in gridsearch_results_list
}
#############################################################################################################################################################################################
### Setting up KFold crossvalidation with 5 splits.
diff_telco_cross_validation_split = KFold(n_splits = 5, shuffle = True, random_state = 22)
### Creating containers to store average model metrics information and ROC curve information (false positive rate and true positive rate arrays).
diff_telco_average_model_metrics = []
diff_telco_roc_data = {}
### Looping through each combination of model, scaling technique, and sampling technique.
for diff_telco_classification_model_name, diff_telco_classification_model_config in diff_telco_optimized_models.items():
### Getting classification model and set optimized hyperparameters
diff_telco_classification_model = diff_telco_classification_models[diff_telco_classification_model_name]
diff_telco_classification_model.set_params(**diff_telco_classification_model_config['best_hyperparameters'])
### Get scaling and sampling techniques
diff_telco_scaling_technique = diff_telco_scaling_techniques[diff_telco_classification_model_config['scaling_technique']]
diff_telco_sampling_technique = diff_telco_sampling_techniques[diff_telco_classification_model_config['sampling_technique']]
### Creating containers to store model metric information for each cross validation fold.
diff_telco_accuracy_scores_per_fold = []
diff_telco_precision_scores_per_fold = []
diff_telco_recall_scores_per_fold = []
diff_telco_f1_scores_per_fold = []
diff_telco_log_loss_values_per_fold = []
diff_telco_roc_auc_scores_per_fold = []
### Creating containers to store true labels and predicted probabilities across all folds for ROC curve calculations.
diff_telco_all_true_labels = []
diff_telco_all_probabilities = []
### Setting up cross validation dataset splits
for diff_telco_train_indexes, diff_telco_test_indexes in diff_telco_cross_validation_split.split(diff_telco_X, diff_telco_y):
diff_telco_X_train = diff_telco_X.iloc[diff_telco_train_indexes].copy()
diff_telco_X_test = diff_telco_X.iloc[diff_telco_test_indexes].copy()
diff_telco_y_train = diff_telco_y.iloc[diff_telco_train_indexes].copy()
diff_telco_y_test = diff_telco_y.iloc[diff_telco_test_indexes].copy()
### Applying the scaling techniques to only the continuous features defined above.
if diff_telco_scaling_technique is not None:
diff_telco_X_train.loc[:, diff_telco_continuous_columns] = diff_telco_scaling_technique.fit_transform(diff_telco_X_train[diff_telco_continuous_columns])
diff_telco_X_test.loc[:, diff_telco_continuous_columns] = diff_telco_scaling_technique.transform(diff_telco_X_test[diff_telco_continuous_columns])
### Applying the sampling techniques.
if diff_telco_sampling_technique is not None:
diff_telco_X_train, diff_telco_y_train = diff_telco_sampling_technique.fit_resample(diff_telco_X_train, diff_telco_y_train)
### Training the classification model.
diff_telco_classification_model.fit(diff_telco_X_train, diff_telco_y_train)
### Generating predictions and probabilities of churn on the testing dataset.
diff_telco_y_predicted_labels = diff_telco_classification_model.predict(diff_telco_X_test)
diff_telco_y_predicted_probabilities = diff_telco_classification_model.predict_proba(diff_telco_X_test)[:, 1]
### Calculating performance metrics for each cross validation fold.
diff_telco_accuracy_scores_per_fold.append(accuracy_score(diff_telco_y_test, diff_telco_y_predicted_labels))
diff_telco_precision_scores_per_fold.append(precision_score(diff_telco_y_test, diff_telco_y_predicted_labels))
diff_telco_recall_scores_per_fold.append(recall_score(diff_telco_y_test, diff_telco_y_predicted_labels))
diff_telco_f1_scores_per_fold.append(f1_score(diff_telco_y_test, diff_telco_y_predicted_labels))
diff_telco_log_loss_values_per_fold.append(log_loss(diff_telco_y_test, diff_telco_y_predicted_probabilities))
diff_telco_roc_auc_scores_per_fold.append(roc_auc_score(diff_telco_y_test, diff_telco_y_predicted_probabilities))
### Collecting data for ROC curve
diff_telco_all_true_labels.extend(diff_telco_y_test.tolist())
diff_telco_all_probabilities.extend(diff_telco_y_predicted_probabilities.tolist())
### Computing a single ROC curve for each combination across all folds.
diff_telco_false_positive_rates, diff_telco_true_positive_rates, _ = roc_curve(diff_telco_all_true_labels, diff_telco_all_probabilities)
### Storing averaged metrics across all folds for each combination.
diff_telco_average_model_metrics.append({
'classification_model': diff_telco_classification_model_name,
'scaling_technique': diff_telco_classification_model_config['scaling_technique'],
'sampling_technique': diff_telco_classification_model_config['sampling_technique'],
'accuracy': np.mean(diff_telco_accuracy_scores_per_fold),
'precision': np.mean(diff_telco_precision_scores_per_fold),
'recall': np.mean(diff_telco_recall_scores_per_fold),
'f1_score': np.mean(diff_telco_f1_scores_per_fold),
'log_loss': np.mean(diff_telco_log_loss_values_per_fold),
'roc_auc': np.mean(diff_telco_roc_auc_scores_per_fold)
})
### Storing ROC curve information for plotting
diff_telco_roc_data[(diff_telco_classification_model_name, diff_telco_classification_model_config['scaling_technique'], diff_telco_classification_model_config['sampling_technique'])] = (diff_telco_false_positive_rates, diff_telco_true_positive_rates)
### Converting to DataFrame
diff_telco_results_df = pd.DataFrame(diff_telco_average_model_metrics)
print("Performance on a different Telco DF:")
display(diff_telco_results_df.sort_values(by = 'f1_score', ascending = False))
Performance on a different Telco DF:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | xgboost_classifier | no_scaling | random_oversampling | 0.999922 | 0.999869 | 0.999968 | 0.999918 | 0.000582 | 0.999992 |
| 1 | random_forest_classifier | standard_scaler | random_undersampling | 0.998742 | 0.998261 | 0.999082 | 0.998671 | 0.004244 | 0.999978 |
| 5 | decision_tree_classifier | standard_scaler | random_undersampling | 0.956613 | 0.930023 | 0.982346 | 0.955465 | 0.088584 | 0.994410 |
| 2 | support_vector_classifier | standard_scaler | random_oversampling | 0.942228 | 0.934457 | 0.944284 | 0.939341 | 0.143388 | 0.987544 |
| 4 | gaussian_naive_bayes_classifier | standard_scaler | smote | 0.835151 | 0.800956 | 0.867575 | 0.832931 | 0.403325 | 0.908518 |
| 3 | logistic_regression | standard_scaler | random_oversampling | 0.825504 | 0.800325 | 0.841580 | 0.820427 | 0.393823 | 0.904097 |
In [98]:
### Plotting the performance metrics of the optimized models on the different telco churn dataframe.
plt.figure(figsize = (20, 15))
### Plotting Accuracy Performance
plt.subplot(2, 3, 1)
sns.barplot(x = 'classification_model', y = 'accuracy', data = diff_telco_results_df.sort_values(by = 'accuracy', ascending = False))
plt.title('Model Accuracy')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Precision Performance
plt.subplot(2, 3, 2)
sns.barplot(x = 'classification_model', y = 'precision', data = diff_telco_results_df.sort_values(by = 'precision', ascending = False))
plt.title('Model Precision')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Recall Performance
plt.subplot(2, 3, 3)
sns.barplot(x = 'classification_model', y = 'recall', data = diff_telco_results_df.sort_values(by = 'recall', ascending = False))
plt.title('Model Recall')
plt.xticks(rotation = 45, ha = 'right')
### Plotting F1 Score Performance
plt.subplot(2, 3, 4)
sns.barplot(x = 'classification_model', y = 'f1_score', data = diff_telco_results_df.sort_values(by = 'f1_score', ascending = False))
plt.title('Model F1 Score')
plt.xticks(rotation = 45, ha = 'right')
### Plotting ROC AUC Performance
plt.subplot(2, 3, 5)
sns.barplot(x = 'classification_model', y = 'roc_auc', data = diff_telco_results_df.sort_values(by = 'roc_auc', ascending = False))
plt.title('Model ROC AUC')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [100]:
### Looping through each optimized model and creating a ROC curve using the different telco dataframe.
for index, row in diff_telco_results_df.iterrows():
diff_telco_model_name = row['classification_model']
diff_telco_scaler_name = row['scaling_technique']
diff_telco_sampler_name = row['sampling_technique']
diff_telco_combination_key = (diff_telco_model_name, diff_telco_scaler_name, diff_telco_sampler_name)
if diff_telco_combination_key in diff_telco_roc_data:
diff_telco_false_positive_rate, diff_telco_true_positive_rate = diff_telco_roc_data[diff_telco_combination_key]
diff_telco_combination_label = f"{diff_telco_model_name} | {diff_telco_scaler_name} | {diff_telco_sampler_name}"
plt.figure(figsize = (15, 10))
sns.lineplot(x = diff_telco_false_positive_rate, y = diff_telco_true_positive_rate, label = diff_telco_combination_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title(f"ROC Curve of Optimized {diff_telco_model_name} Model On Different Telco DF")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc = "lower right")
plt.tight_layout()
plt.show()
In [102]:
### SHAP Feature Importances
### Extract the optimal XGBoost configuration from gridsearch_results_list
diff_telco_xgb_config = next(cfg for cfg in gridsearch_results_list if cfg['classification_model'] == 'xgboost_classifier')
### Initializing the XGBoost model with the optimized hyperparameters.
diff_telco_xgb_model = XGBClassifier(random_state = 22)
diff_telco_xgb_model.set_params(**diff_telco_xgb_config['best_hyperparameters'])
### Defining the input feature matrix (X) and the target variable vector (y).
diff_telco_X_SHAP = diff_telco_X.copy()
diff_telco_y_SHAP = diff_telco_y.copy()
### Applying the optimal scaling techniques.
if diff_telco_scaling_technique is not None:
diff_telco_X_SHAP[diff_telco_continuous_columns] = diff_telco_scaling_technique.fit_transform(diff_telco_X_SHAP[diff_telco_continuous_columns])
### Applying the optimal sampling techniques.
if diff_telco_sampling_technique is not None:
diff_telco_X_SHAP, diff_telco_y_SHAP = diff_telco_sampling_technique.fit_resample(diff_telco_X_SHAP, diff_telco_y_SHAP)
### Fitting the model.
diff_telco_xgb_model.fit(diff_telco_X_SHAP, diff_telco_y_SHAP)
### SHAP analysis
diff_telco_explainer = shap.TreeExplainer(diff_telco_xgb_model)
diff_telco_shap_values = diff_telco_explainer.shap_values(diff_telco_X_SHAP)
### Creating a feature importance dataframe.
diff_telco_shap_df = pd.DataFrame({
'Feature': diff_telco_X_SHAP.columns,
'Mean_SHAP_value': diff_telco_shap_values.mean(axis = 0),
'Mean_ABS_SHAP_value': np.abs(diff_telco_shap_values).mean(axis = 0)
}).sort_values('Mean_ABS_SHAP_value', ascending = False)
### Plotting the top 10 Mean SHAP Feature Contributions sorted by mean absolute shap values
plt.figure(figsize = (10, 6))
sns.barplot(data = diff_telco_shap_df.head(10), x = 'Mean_SHAP_value', y = 'Feature')
plt.title('Top 10 Mean SHAP Feature Contributions (via XGBoost)')
plt.xlabel('Mean SHAP Value (Direction + Magnitude)')
plt.ylabel('Features')
plt.axvline(0, color = 'black', linestyle = '--')
plt.tight_layout()
plt.show()
print('\n')
### Displaying the feature importance dataframe.
print('diff_telco_shap_df:')
display(diff_telco_shap_df)
print('\n')
diff_telco_shap_df:
| Feature | Mean_SHAP_value | Mean_ABS_SHAP_value | |
|---|---|---|---|
| 4 | Payment Delay | -0.744704 | 5.617241 |
| 3 | Support Calls | 1.060048 | 4.007386 |
| 2 | Usage Frequency | -1.902989 | 3.127197 |
| 1 | Tenure | 1.069654 | 2.915936 |
| 7 | Gender_Female | -0.151843 | 1.940470 |
| 5 | Total Spend | 0.245153 | 1.920382 |
| 0 | Age | 0.010967 | 1.456922 |
| 13 | Contract Length_Monthly | 0.179899 | 1.009675 |
| 12 | Contract Length_Annual | -0.187241 | 0.460857 |
| 8 | Gender_Male | -0.001544 | 0.401240 |
| 9 | Subscription Type_Basic | -0.077020 | 0.344382 |
| 14 | Contract Length_Quarterly | -0.006702 | 0.235833 |
| 6 | Last Interaction | 0.029638 | 0.118965 |
| 11 | Subscription Type_Standard | -0.000065 | 0.064951 |
| 10 | Subscription Type_Premium | -0.000066 | 0.017166 |
In [ ]:
###############################################################################################################################################
###############################################################################################################################################
###############################################################################################################################################
###############################################################################################################################################
In [104]:
### Loading a bank customer churn dataset to compare model performance on data from a different industry.
bank_churn_url = "https://raw.githubusercontent.com/adamcookeunc/DATA_780_PROJECT/refs/heads/main/Churn_Modelling.csv"
bank_churn_df = pd.read_csv(bank_churn_url)
bank_churn_df.head()
Out[104]:
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
In [106]:
### Dropping irrelavent coolumns.
bank_churn_df = bank_churn_df.drop(columns = ['RowNumber', 'CustomerId', 'Surname'])
### Ensuring that all numerical columns are in the float 64 formatting.
bank_churn_df['CreditScore'] = bank_churn_df['CreditScore'].astype(float)
bank_churn_df['Age'] = bank_churn_df['Age'].astype(float)
bank_churn_df['Tenure'] = bank_churn_df['Tenure'].astype(float)
bank_churn_df['Balance'] = bank_churn_df['Balance'].astype(float)
bank_churn_df['Tenure'] = bank_churn_df['Tenure'].astype(float)
bank_churn_df['NumOfProducts'] = bank_churn_df['NumOfProducts'].astype(float)
bank_churn_df['HasCrCard'] = bank_churn_df['HasCrCard'].astype(float)
bank_churn_df['IsActiveMember'] = bank_churn_df['IsActiveMember'].astype(float)
bank_churn_df['EstimatedSalary'] = bank_churn_df['EstimatedSalary'].astype(float)
bank_churn_df['Exited'] = bank_churn_df['Exited'].astype(float)
### One-Hot-Encoding categorical/nominal columns
bank_churn_df_encoded = pd.get_dummies(bank_churn_df, columns = ['Geography', 'Gender'], dtype = float)
### Excluding the churn column (the column that we are predicting)
bank_churn_df_encoded_excluding_churn = bank_churn_df_encoded.drop(columns = ['Exited'])
In [108]:
### Summary statistics
print("Summary statistics:")
display(bank_churn_df.describe(include = 'all'))
print('\n')
### Data types
print("Data types:")
display(bank_churn_df.dtypes)
print('\n')
### Number of missing values
print("Number of missing values:")
display(bank_churn_df.isna().sum())
print('\n')
### Number of unique values
print("Number of unique values:")
display(bank_churn_df.nunique())
print('\n')
Summary statistics:
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10000.000000 | 10000 | 10000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.000000 | 10000.000000 | 10000.000000 |
| unique | NaN | 3 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | NaN | France | Male | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | NaN | 5014 | 5457 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 650.528800 | NaN | NaN | 38.921800 | 5.012800 | 76485.889288 | 1.530200 | 0.70550 | 0.515100 | 100090.239881 | 0.203700 |
| std | 96.653299 | NaN | NaN | 10.487806 | 2.892174 | 62397.405202 | 0.581654 | 0.45584 | 0.499797 | 57510.492818 | 0.402769 |
| min | 350.000000 | NaN | NaN | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 11.580000 | 0.000000 |
| 25% | 584.000000 | NaN | NaN | 32.000000 | 3.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 51002.110000 | 0.000000 |
| 50% | 652.000000 | NaN | NaN | 37.000000 | 5.000000 | 97198.540000 | 1.000000 | 1.00000 | 1.000000 | 100193.915000 | 0.000000 |
| 75% | 718.000000 | NaN | NaN | 44.000000 | 7.000000 | 127644.240000 | 2.000000 | 1.00000 | 1.000000 | 149388.247500 | 0.000000 |
| max | 850.000000 | NaN | NaN | 92.000000 | 10.000000 | 250898.090000 | 4.000000 | 1.00000 | 1.000000 | 199992.480000 | 1.000000 |
Data types:
CreditScore float64 Geography object Gender object Age float64 Tenure float64 Balance float64 NumOfProducts float64 HasCrCard float64 IsActiveMember float64 EstimatedSalary float64 Exited float64 dtype: object
Number of missing values:
CreditScore 0 Geography 0 Gender 0 Age 0 Tenure 0 Balance 0 NumOfProducts 0 HasCrCard 0 IsActiveMember 0 EstimatedSalary 0 Exited 0 dtype: int64
Number of unique values:
CreditScore 460 Geography 3 Gender 2 Age 70 Tenure 11 Balance 6382 NumOfProducts 4 HasCrCard 2 IsActiveMember 2 EstimatedSalary 9999 Exited 2 dtype: int64
In [110]:
### Distribution of churn (the target variable) on the bank churn dataframe.
plt.figure(figsize = (8, 6))
sns.countplot(x = 'Exited', data = bank_churn_df)
plt.title('Churn Distribution of a Bank Churn DF')
plt.show()
print('\n')
print("Churn Distribution:")
print(bank_churn_df['Exited'].value_counts())
print('\n')
print("Churn Distribution Percentages:")
print(bank_churn_df['Exited'].value_counts(normalize = True) * 100)
print('\n')
Churn Distribution: Exited 0.0 7963 1.0 2037 Name: count, dtype: int64 Churn Distribution Percentages: Exited 0.0 79.63 1.0 20.37 Name: proportion, dtype: float64
In [112]:
### Distribution of Bank Churn DF Numerical Features
plt.figure(figsize = (18, 13))
### Age Distribution
plt.subplot(2, 3, 1)
sns.histplot(bank_churn_df['CreditScore'], kde = True)
plt.axvline(bank_churn_df['CreditScore'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(bank_churn_df['CreditScore'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Age Distribution')
### Age Distribution
plt.subplot(2, 3, 2)
sns.histplot(bank_churn_df['Age'], kde = True)
plt.axvline(bank_churn_df['Age'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(bank_churn_df['Age'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Age Distribution')
### Tenure Distribution
plt.subplot(2, 3, 3)
sns.histplot(bank_churn_df['Tenure'], kde = True)
plt.axvline(bank_churn_df['Tenure'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(bank_churn_df['Tenure'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Tenure Distribution')
### Balance Distribution
plt.subplot(2, 3, 4)
sns.histplot(bank_churn_df['Balance'], kde = True)
plt.axvline(bank_churn_df['Balance'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(bank_churn_df['Balance'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Balance Distribution')
### Number of Products Distribution
plt.subplot(2, 3, 5)
sns.histplot(bank_churn_df['NumOfProducts'], kde = True)
plt.axvline(bank_churn_df['NumOfProducts'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(bank_churn_df['NumOfProducts'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Number of Products Distribution')
### Estimated Salary Distribution
plt.subplot(2, 3, 6)
sns.histplot(bank_churn_df['EstimatedSalary'], kde = True)
plt.axvline(bank_churn_df['EstimatedSalary'].mean(), color = 'red', linestyle = '--', label = 'Mean')
plt.axvline(bank_churn_df['EstimatedSalary'].median(), color = 'green', linestyle = '-', label = 'Median')
plt.legend()
plt.title('Estimated Salary Distribution')
plt.tight_layout()
plt.show()
In [114]:
### Churn vs Bank Churn DF Numerical Features
plt.figure(figsize = (18, 13))
### Churn vs Credit Score
plt.subplot(2, 3, 1)
sns.boxplot(x = 'Exited', y = 'CreditScore', data = bank_churn_df)
plt.title('Churn vs Credit Score')
### Churn vs Age
plt.subplot(2, 3, 2)
sns.boxplot(x = 'Exited', y = 'Age', data = bank_churn_df)
plt.title('Churn vs Age')
### Churn vs Tenure
plt.subplot(2, 3, 3)
sns.boxplot(x = 'Exited', y = 'Tenure', data = bank_churn_df)
plt.title('Churn vs Tenure')
### Churn vs Balance
plt.subplot(2, 3, 4)
sns.boxplot(x = 'Exited', y = 'Balance', data = bank_churn_df)
plt.title('Churn vs Balance')
### Churn vs Number of Products
plt.subplot(2, 3, 5)
sns.boxplot(x = 'Exited', y = 'NumOfProducts', data = bank_churn_df)
plt.title('Churn vs Number of Products')
### Churn vs Estimated Salary
plt.subplot(2, 3, 6)
sns.boxplot(x = 'Exited', y = 'EstimatedSalary', data = bank_churn_df)
plt.title('Churn vs Estimated Salary')
plt.tight_layout()
plt.show()
In [116]:
### Churn vs Bank Churn DF Categorical Features
plt.figure(figsize = (15, 10))
### Churn vs Geography
plt.subplot(2, 2, 1)
sns.countplot(x = 'Geography', hue = 'Exited', data = bank_churn_df)
plt.title('Churn vs Geography')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Gender
plt.subplot(2, 2, 2)
sns.countplot(x = 'Gender', hue = 'Exited', data = bank_churn_df)
plt.title('Churn vs Gender')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Has Credit Card
plt.subplot(2, 2, 3)
sns.countplot(x = 'HasCrCard', hue = 'Exited', data = bank_churn_df)
plt.title('Churn vs Has Credit Card ')
plt.xticks(rotation = 45, ha = 'right')
### Churn vs Is Active Member
plt.subplot(2, 2, 4)
sns.countplot(x = 'IsActiveMember', hue = 'Exited', data = bank_churn_df)
plt.title('Churn vs Is Active Member')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [120]:
### Rerun optimized models on the bank churn dataframe.
### Defining the input feature matrix (X) and the target variable vector (y).
bank_churn_X = bank_churn_df_encoded_excluding_churn.copy()
bank_churn_y = bank_churn_df_encoded['Exited']
### Creating a list of the continuous features that we will be scaling, we do not want to scale the binary indicator columns only the continuous numerical columns.
bank_churn_continuous_columns = [
'CreditScore',
'Age',
'Tenure',
'Balance',
'NumOfProducts',
'EstimatedSalary'
]
#############################################################################################################################################################################################
### Defining Sampling/Scaling techniques and the classification models we are going to be comparing.
bank_churn_classification_models = {
'logistic_regression': LogisticRegression(random_state = 22),
'support_vector_classifier': SVC(probability = True, random_state = 22),
'decision_tree_classifier': DecisionTreeClassifier(random_state = 22),
'random_forest_classifier': RandomForestClassifier(random_state = 22),
'gaussian_naive_bayes_classifier': GaussianNB(),
'xgboost_classifier': XGBClassifier(random_state = 22)
}
bank_churn_scaling_techniques = {
'no_scaling': None,
'standard_scaler': StandardScaler(),
'robust_scaler': RobustScaler(),
'min_max_scaler': MinMaxScaler()
}
bank_churn_sampling_techniques = {
'no_sampling': None,
'random_undersampling': RandomUnderSampler(random_state = 22),
'random_oversampling': RandomOverSampler(random_state = 22),
'smote': SMOTE(random_state = 22)
}
#############################################################################################################################################################################################
### Optimized model configurations from gridsearch_results_list
bank_churn_optimized_models = {
entry['classification_model']: {
'scaling_technique': entry['scaling_technique'],
'sampling_technique': entry['sampling_technique'],
'best_hyperparameters': entry['best_hyperparameters']
}
for entry in gridsearch_results_list
}
#############################################################################################################################################################################################
### Setting up KFold crossvalidation with 5 splits.
bank_churn_cross_validation_split = KFold(n_splits = 5, shuffle = True, random_state = 22)
### Creating containers to store average model metrics information and ROC curve information (false positive rate and true positive rate arrays).
bank_churn_average_model_metrics = []
bank_churn_roc_data = {}
### Looping through each combination of model, scaling technique, and sampling technique.
for bank_churn_classification_model_name, bank_churn_classification_model_config in bank_churn_optimized_models.items():
### Getting classification model and set optimized hyperparameters
bank_churn_classification_model = bank_churn_classification_models[bank_churn_classification_model_name]
bank_churn_classification_model.set_params(**bank_churn_classification_model_config['best_hyperparameters'])
### Get scaling and sampling techniques
bank_churn_scaling_technique = bank_churn_scaling_techniques[bank_churn_classification_model_config['scaling_technique']]
bank_churn_sampling_technique = bank_churn_sampling_techniques[bank_churn_classification_model_config['sampling_technique']]
### Creating containers to store model metric information for each cross validation fold.
bank_churn_accuracy_scores_per_fold = []
bank_churn_precision_scores_per_fold = []
bank_churn_recall_scores_per_fold = []
bank_churn_f1_scores_per_fold = []
bank_churn_log_loss_values_per_fold = []
bank_churn_roc_auc_scores_per_fold = []
### Creating containers to store true labels and predicted probabilities across all folds for ROC curve calculations.
bank_churn_all_true_labels = []
bank_churn_all_probabilities = []
### Setting up cross validation dataset splits
for bank_churn_train_indexes, bank_churn_test_indexes in bank_churn_cross_validation_split.split(bank_churn_X, bank_churn_y):
bank_churn_X_train = bank_churn_X.iloc[bank_churn_train_indexes].copy()
bank_churn_X_test = bank_churn_X.iloc[bank_churn_test_indexes].copy()
bank_churn_y_train = bank_churn_y.iloc[bank_churn_train_indexes].copy()
bank_churn_y_test = bank_churn_y.iloc[bank_churn_test_indexes].copy()
### Applying the scaling techniques to only the continuous features defined above.
if bank_churn_scaling_technique is not None:
bank_churn_X_train.loc[:, bank_churn_continuous_columns] = bank_churn_scaling_technique.fit_transform(bank_churn_X_train[bank_churn_continuous_columns])
bank_churn_X_test.loc[:, bank_churn_continuous_columns] = bank_churn_scaling_technique.transform(bank_churn_X_test[bank_churn_continuous_columns])
### Applying the sampling techniques.
if bank_churn_sampling_technique is not None:
bank_churn_X_train, bank_churn_y_train = bank_churn_sampling_technique.fit_resample(bank_churn_X_train, bank_churn_y_train)
### Training the classification model.
bank_churn_classification_model.fit(bank_churn_X_train, bank_churn_y_train)
### Generating predictions and probabilities of churn on the testing dataset.
bank_churn_y_predicted_labels = bank_churn_classification_model.predict(bank_churn_X_test)
bank_churn_y_predicted_probabilities = bank_churn_classification_model.predict_proba(bank_churn_X_test)[:, 1]
### Calculating performance metrics for each cross validation fold.
bank_churn_accuracy_scores_per_fold.append(accuracy_score(bank_churn_y_test, bank_churn_y_predicted_labels))
bank_churn_precision_scores_per_fold.append(precision_score(bank_churn_y_test, bank_churn_y_predicted_labels))
bank_churn_recall_scores_per_fold.append(recall_score(bank_churn_y_test, bank_churn_y_predicted_labels))
bank_churn_f1_scores_per_fold.append(f1_score(bank_churn_y_test, bank_churn_y_predicted_labels))
bank_churn_log_loss_values_per_fold.append(log_loss(bank_churn_y_test, bank_churn_y_predicted_probabilities))
bank_churn_roc_auc_scores_per_fold.append(roc_auc_score(bank_churn_y_test, bank_churn_y_predicted_probabilities))
### Collecting data for ROC curve
bank_churn_all_true_labels.extend(bank_churn_y_test.tolist())
bank_churn_all_probabilities.extend(bank_churn_y_predicted_probabilities.tolist())
### Computing a single ROC curve for each combination across all folds.
bank_churn_false_positive_rates, bank_churn_true_positive_rates, _ = roc_curve(bank_churn_all_true_labels, bank_churn_all_probabilities)
### Storing averaged metrics across all folds for each combination.
bank_churn_average_model_metrics.append({
'classification_model': bank_churn_classification_model_name,
'scaling_technique': bank_churn_classification_model_config['scaling_technique'],
'sampling_technique': bank_churn_classification_model_config['sampling_technique'],
'accuracy': np.mean(bank_churn_accuracy_scores_per_fold),
'precision': np.mean(bank_churn_precision_scores_per_fold),
'recall': np.mean(bank_churn_recall_scores_per_fold),
'f1_score': np.mean(bank_churn_f1_scores_per_fold),
'log_loss': np.mean(bank_churn_log_loss_values_per_fold),
'roc_auc': np.mean(bank_churn_roc_auc_scores_per_fold)
})
### Storing ROC curve information for plotting
bank_churn_roc_data[(bank_churn_classification_model_name, bank_churn_classification_model_config['scaling_technique'], bank_churn_classification_model_config['sampling_technique'])] = (bank_churn_false_positive_rates, bank_churn_true_positive_rates)
### Converting to DataFrame
bank_churn_results_df = pd.DataFrame(bank_churn_average_model_metrics)
print("Performance on a Bank Churn DF:")
display(bank_churn_results_df.sort_values(by = 'f1_score', ascending = False))
Performance on a Bank Churn DF:
| classification_model | scaling_technique | sampling_technique | accuracy | precision | recall | f1_score | log_loss | roc_auc | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | xgboost_classifier | no_scaling | random_oversampling | 0.8384 | 0.607759 | 0.584399 | 0.595701 | 0.422642 | 0.836589 |
| 1 | random_forest_classifier | standard_scaler | random_undersampling | 0.7810 | 0.476573 | 0.756371 | 0.584504 | 0.474847 | 0.856955 |
| 5 | decision_tree_classifier | standard_scaler | random_undersampling | 0.7449 | 0.430041 | 0.761398 | 0.548784 | 0.618796 | 0.830766 |
| 4 | gaussian_naive_bayes_classifier | standard_scaler | smote | 0.7361 | 0.408822 | 0.661405 | 0.505227 | 0.549287 | 0.781643 |
| 3 | logistic_regression | standard_scaler | random_oversampling | 0.7117 | 0.383714 | 0.687141 | 0.492409 | 0.577906 | 0.768511 |
| 2 | support_vector_classifier | standard_scaler | random_oversampling | 0.7991 | 0.509257 | 0.361364 | 0.422674 | 0.698014 | 0.764247 |
In [125]:
### Plotting the performance metrics of the optimized models on the bank churn dataframe.
plt.figure(figsize = (20, 15))
### Plotting Accuracy Performance
plt.subplot(2, 3, 1)
sns.barplot(x = 'classification_model', y = 'accuracy', data = bank_churn_results_df.sort_values(by = 'accuracy', ascending = False))
plt.title('Model Accuracy')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Precision Performance
plt.subplot(2, 3, 2)
sns.barplot(x = 'classification_model', y = 'precision', data = bank_churn_results_df.sort_values(by = 'precision', ascending = False))
plt.title('Model Precision')
plt.xticks(rotation = 45, ha = 'right')
### Plotting Recall Performance
plt.subplot(2, 3, 3)
sns.barplot(x = 'classification_model', y = 'recall', data = bank_churn_results_df.sort_values(by = 'recall', ascending = False))
plt.title('Model Recall')
plt.xticks(rotation = 45, ha = 'right')
### Plotting F1 Score Performance
plt.subplot(2, 3, 4)
sns.barplot(x = 'classification_model', y = 'f1_score', data = bank_churn_results_df.sort_values(by = 'f1_score', ascending = False))
plt.title('Model F1 Score')
plt.xticks(rotation = 45, ha = 'right')
### Plotting ROC AUC Performance
plt.subplot(2, 3, 5)
sns.barplot(x = 'classification_model', y = 'roc_auc', data = bank_churn_results_df.sort_values(by = 'roc_auc', ascending = False))
plt.title('Model ROC AUC')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.show()
In [128]:
### Looping through each optimized model and creating a ROC curve using the bank churn dataframe.
for index, row in bank_churn_results_df.iterrows():
bank_churn_model_name = row['classification_model']
bank_churn_scaler_name = row['scaling_technique']
bank_churn_sampler_name = row['sampling_technique']
bank_churn_combination_key = (bank_churn_model_name, bank_churn_scaler_name, bank_churn_sampler_name)
if bank_churn_combination_key in bank_churn_roc_data:
bank_churn_false_positive_rate, bank_churn_true_positive_rate = bank_churn_roc_data[bank_churn_combination_key]
bank_churn_combination_label = f"{bank_churn_model_name} | {bank_churn_scaler_name} | {bank_churn_sampler_name}"
plt.figure(figsize = (15, 10))
sns.lineplot(x = bank_churn_false_positive_rate, y = bank_churn_true_positive_rate, label = bank_churn_combination_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.title(f"ROC Curve of Optimized {bank_churn_model_name} Model On Bank Churn DF")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc = "lower right")
plt.tight_layout()
plt.show()
In [130]:
### SHAP Feature Importances
### Extract the optimal XGBoost configuration from gridsearch_results_list
bank_churn_xgb_config = next(cfg for cfg in gridsearch_results_list if cfg['classification_model'] == 'xgboost_classifier')
### Initializing the XGBoost model with the optimized hyperparameters.
bank_churn_xgb_model = XGBClassifier(random_state = 22)
bank_churn_xgb_model.set_params(**bank_churn_xgb_config['best_hyperparameters'])
### Defining the input feature matrix (X) and the target variable vector (y).
bank_churn_X_SHAP = bank_churn_X.copy()
bank_churn_y_SHAP = bank_churn_y.copy()
### Applying the optimal scaling techniques.
if bank_churn_scaling_technique is not None:
bank_churn_X_SHAP[bank_churn_continuous_columns] = bank_churn_scaling_technique.fit_transform(bank_churn_X_SHAP[bank_churn_continuous_columns])
### Applying the optimal sampling techniques.
if bank_churn_sampling_technique is not None:
bank_churn_X_SHAP, bank_churn_y_SHAP = bank_churn_sampling_technique.fit_resample(bank_churn_X_SHAP, bank_churn_y_SHAP)
### Fitting the model.
bank_churn_xgb_model.fit(bank_churn_X_SHAP, bank_churn_y_SHAP)
### SHAP analysis
bank_churn_explainer = shap.TreeExplainer(bank_churn_xgb_model)
bank_churn_shap_values = bank_churn_explainer.shap_values(bank_churn_X_SHAP)
### Creating a feature importance dataframe.
bank_churn_shap_df = pd.DataFrame({
'Feature': bank_churn_X_SHAP.columns,
'Mean_SHAP_value': bank_churn_shap_values.mean(axis = 0),
'Mean_ABS_SHAP_value': np.abs(bank_churn_shap_values).mean(axis = 0)
}).sort_values('Mean_ABS_SHAP_value', ascending = False)
### Plotting the top 10 Mean SHAP Feature Contributions sorted by mean absolute shap values
plt.figure(figsize = (10, 6))
sns.barplot(data = bank_churn_shap_df.head(10), x = 'Mean_SHAP_value', y = 'Feature')
plt.title('Top 10 Mean SHAP Feature Contributions (via XGBoost)')
plt.xlabel('Mean SHAP Value (Direction + Magnitude)')
plt.ylabel('Features')
plt.axvline(0, color = 'black', linestyle = '--')
plt.tight_layout()
plt.show()
print('\n')
### Displaying the feature importance dataframe.
print('bank_churn_shap_df:')
display(bank_churn_shap_df)
print('\n')
bank_churn_shap_df:
| Feature | Mean_SHAP_value | Mean_ABS_SHAP_value | |
|---|---|---|---|
| 1 | Age | 0.121641 | 1.509703 |
| 4 | NumOfProducts | 0.118869 | 1.265618 |
| 6 | IsActiveMember | 0.020889 | 0.770891 |
| 3 | Balance | 0.009897 | 0.629049 |
| 7 | EstimatedSalary | -0.040298 | 0.543720 |
| 0 | CreditScore | 0.007980 | 0.538873 |
| 9 | Geography_Germany | 0.031302 | 0.305719 |
| 2 | Tenure | -0.011614 | 0.297383 |
| 11 | Gender_Female | -0.016330 | 0.293192 |
| 8 | Geography_France | -0.002522 | 0.145191 |
| 5 | HasCrCard | 0.000704 | 0.097508 |
| 12 | Gender_Male | -0.003767 | 0.095222 |
| 10 | Geography_Spain | 0.003395 | 0.083521 |
In [188]:
### Comparing model performance across mulitple datasets.
### Sorting all results dataframes by classification_model to ensure consistent row order
gridsearch_results_dataframe = gridsearch_results_dataframe.sort_values('classification_model')
telco_churn_engineered_features_results_df = telco_churn_engineered_features_results_df.sort_values('classification_model')
diff_telco_results_df = diff_telco_results_df.sort_values('classification_model')
bank_churn_results_df = bank_churn_results_df.sort_values('classification_model')
### Creating comparison DataFrames for each performance metric
accuracy_score_df = pd.DataFrame({
'Original': gridsearch_results_dataframe['accuracy'].values,
'Telco_Engineered': telco_churn_engineered_features_results_df['accuracy'].values,
'Telco_Diff': diff_telco_results_df['accuracy'].values,
'Bank_Churn': bank_churn_results_df['accuracy'].values
}, index = gridsearch_results_dataframe['classification_model'])
accuracy_score_df = accuracy_score_df.sort_values(by = 'Original', ascending = False)
accuracy_score_df.index.name = 'classification_model'
precision_score_df = pd.DataFrame({
'Original': gridsearch_results_dataframe['precision'].values,
'Telco_Engineered': telco_churn_engineered_features_results_df['precision'].values,
'Telco_Diff': diff_telco_results_df['precision'].values,
'Bank_Churn': bank_churn_results_df['precision'].values
}, index = gridsearch_results_dataframe['classification_model'])
precision_score_df = precision_score_df.sort_values(by = 'Original', ascending = False)
precision_score_df.index.name = 'classification_model'
recall_score_df = pd.DataFrame({
'Original': gridsearch_results_dataframe['recall'].values,
'Telco_Engineered': telco_churn_engineered_features_results_df['recall'].values,
'Telco_Diff': diff_telco_results_df['recall'].values,
'Bank_Churn': bank_churn_results_df['recall'].values
}, index = gridsearch_results_dataframe['classification_model'])
recall_score_df = recall_score_df.sort_values(by = 'Original', ascending = False)
recall_score_df.index.name = 'classification_model'
f1_score_df = pd.DataFrame({
'Original': gridsearch_results_dataframe['f1_score'].values,
'Telco_Engineered': telco_churn_engineered_features_results_df['f1_score'].values,
'Telco_Diff': diff_telco_results_df['f1_score'].values,
'Bank_Churn': bank_churn_results_df['f1_score'].values
}, index = gridsearch_results_dataframe['classification_model'])
f1_score_df = f1_score_df.sort_values(by = 'Original', ascending = False)
f1_score_df.index.name = 'classification_model'
roc_auc_score_df = pd.DataFrame({
'Original': gridsearch_results_dataframe['roc_auc'].values,
'Telco_Engineered': telco_churn_engineered_features_results_df['roc_auc'].values,
'Telco_Diff': diff_telco_results_df['roc_auc'].values,
'Bank_Churn': bank_churn_results_df['roc_auc'].values
}, index = gridsearch_results_dataframe['classification_model'])
roc_auc_score_df = roc_auc_score_df.sort_values(by = 'Original', ascending = False)
roc_auc_score_df.index.name = 'classification_model'
In [190]:
### Displaying all perfomance results dataframes for comparison across multiple datasets
print("accuracy_score_df:")
display(accuracy_score_df)
print("\n")
print("precision_score_df:")
display(precision_score_df)
print("\n")
print("recall_score_df:")
display(recall_score_df)
print("\n")
print("f1_score_df:")
display(f1_score_df)
print("\n")
print("roc_auc_score_df:")
display(roc_auc_score_df)
print("\n")
accuracy_score_df:
| Original | Telco_Engineered | Telco_Diff | Bank_Churn | |
|---|---|---|---|---|
| classification_model | ||||
| support_vector_classifier | 0.933082 | 0.733077 | 0.942228 | 0.7991 |
| xgboost_classifier | 0.914294 | 0.817972 | 0.999922 | 0.8384 |
| logistic_regression | 0.799632 | 0.777588 | 0.825504 | 0.7117 |
| random_forest_classifier | 0.796956 | 0.777302 | 0.998742 | 0.7810 |
| gaussian_naive_bayes_classifier | 0.787236 | 0.778297 | 0.835151 | 0.7361 |
| decision_tree_classifier | 0.776082 | 0.747298 | 0.956613 | 0.7449 |
precision_score_df:
| Original | Telco_Engineered | Telco_Diff | Bank_Churn | |
|---|---|---|---|---|
| classification_model | ||||
| support_vector_classifier | 0.960004 | 0.459865 | 0.934457 | 0.509257 |
| xgboost_classifier | 0.874552 | 0.655546 | 0.999869 | 0.607759 |
| logistic_regression | 0.781263 | 0.555098 | 0.800325 | 0.383714 |
| gaussian_naive_bayes_classifier | 0.775824 | 0.571152 | 0.800956 | 0.408822 |
| random_forest_classifier | 0.767085 | 0.552262 | 0.998261 | 0.476573 |
| decision_tree_classifier | 0.744953 | 0.516246 | 0.930023 | 0.430041 |
recall_score_df:
| Original | Telco_Engineered | Telco_Diff | Bank_Churn | |
|---|---|---|---|---|
| classification_model | ||||
| xgboost_classifier | 0.967230 | 0.667395 | 0.999968 | 0.584399 |
| support_vector_classifier | 0.903858 | 0.019299 | 0.944284 | 0.361364 |
| random_forest_classifier | 0.854196 | 0.860853 | 0.999082 | 0.756371 |
| decision_tree_classifier | 0.842526 | 0.846182 | 0.982346 | 0.761398 |
| logistic_regression | 0.832183 | 0.821599 | 0.841580 | 0.687141 |
| gaussian_naive_bayes_classifier | 0.808266 | 0.687393 | 0.867575 | 0.661405 |
f1_score_df:
| Original | Telco_Engineered | Telco_Diff | Bank_Churn | |
|---|---|---|---|---|
| classification_model | ||||
| support_vector_classifier | 0.931057 | 0.036977 | 0.939341 | 0.422674 |
| xgboost_classifier | 0.918555 | 0.661047 | 0.999918 | 0.595701 |
| random_forest_classifier | 0.807968 | 0.672624 | 0.998671 | 0.584504 |
| logistic_regression | 0.805904 | 0.662332 | 0.820427 | 0.492409 |
| gaussian_naive_bayes_classifier | 0.791667 | 0.622214 | 0.832931 | 0.505227 |
| decision_tree_classifier | 0.789896 | 0.640395 | 0.955465 | 0.548784 |
roc_auc_score_df:
| Original | Telco_Engineered | Telco_Diff | Bank_Churn | |
|---|---|---|---|---|
| classification_model | ||||
| xgboost_classifier | 0.971051 | 0.882279 | 0.999992 | 0.836589 |
| support_vector_classifier | 0.967525 | 0.764371 | 0.987544 | 0.764247 |
| logistic_regression | 0.887956 | 0.881479 | 0.904097 | 0.768511 |
| random_forest_classifier | 0.878474 | 0.884096 | 0.999978 | 0.856955 |
| gaussian_naive_bayes_classifier | 0.869700 | 0.838566 | 0.908518 | 0.781643 |
| decision_tree_classifier | 0.848311 | 0.839351 | 0.994410 | 0.830766 |
In [198]:
### Plotting the perfomance results for comparison across multiple datasets
plt.figure(figsize = (15, 30))
plt.subplot(5, 1, 1)
sns.barplot(data = accuracy_score_df.reset_index().melt(id_vars = 'classification_model', var_name = 'Dataset', value_name = 'Score'),
x = 'classification_model', y = 'Score', hue = 'Dataset')
plt.title('Accuracy Score Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
plt.xticks(rotation = 45, ha = 'right')
plt.legend(loc = 'lower right')
plt.subplot(5, 1, 2)
sns.barplot(data = precision_score_df.reset_index().melt(id_vars = 'classification_model', var_name = 'Dataset', value_name = 'Score'),
x = 'classification_model', y = 'Score', hue = 'Dataset')
plt.title('Precision Score Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
plt.xticks(rotation = 45, ha = 'right')
plt.legend(loc = 'lower right')
plt.subplot(5, 1, 3)
sns.barplot(data = recall_score_df.reset_index().melt(id_vars = 'classification_model', var_name = 'Dataset', value_name = 'Score'),
x = 'classification_model', y = 'Score', hue = 'Dataset')
plt.title('Recall Score Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
plt.xticks(rotation = 45, ha = 'right')
plt.legend(loc = 'lower right')
plt.subplot(5, 1, 4)
sns.barplot(data = f1_score_df.reset_index().melt(id_vars = 'classification_model', var_name = 'Dataset', value_name = 'Score'),
x = 'classification_model', y = 'Score', hue = 'Dataset')
plt.title('F1 Score Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
plt.xticks(rotation = 45, ha = 'right')
plt.legend(loc = 'lower right')
plt.subplot(5, 1, 5)
sns.barplot(data = roc_auc_score_df.reset_index().melt(id_vars = 'classification_model', var_name = 'Dataset', value_name = 'Score'),
x = 'classification_model', y = 'Score', hue = 'Dataset')
plt.title('ROC AUC Score Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
plt.xticks(rotation = 45, ha = 'right')
plt.legend(loc = 'lower right')
plt.tight_layout()
plt.show()
In [ ]: